diff --git a/.github/workflows/basic-tests-linux-uv.yml b/.github/workflows/basic-tests-linux-uv.yml index d2b9cc4..a0e75b8 100644 --- a/.github/workflows/basic-tests-linux-uv.yml +++ b/.github/workflows/basic-tests-linux-uv.yml @@ -66,7 +66,7 @@ jobs: shell: bash run: | source .venv/bin/activate - pytest ch02/05_bpe-from-scratch/tests/tests.py + pytest ch02/05_bpe-from-scratch/tests.py - name: Test Selected Bonus Materials shell: bash diff --git a/ch02/05_bpe-from-scratch/bpe-from-scratch.ipynb b/ch02/05_bpe-from-scratch/bpe-from-scratch.ipynb index 9853ac4..bb754b7 100644 --- a/ch02/05_bpe-from-scratch/bpe-from-scratch.ipynb +++ b/ch02/05_bpe-from-scratch/bpe-from-scratch.ipynb @@ -81,7 +81,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 39, "id": "8c9bc9e4-120f-4bac-8fa6-6523c568d12e", "metadata": {}, "outputs": [ @@ -109,7 +109,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 40, "id": "6c586945-d459-4f9a-855d-bf73438ef0e3", "metadata": {}, "outputs": [ @@ -138,7 +138,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 41, "id": "0d5b61d9-79a0-48b4-9b3e-64ab595c5b01", "metadata": {}, "outputs": [ @@ -382,7 +382,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 42, "id": "3e4a15ec-2667-4f56-b7c1-34e8071b621d", "metadata": {}, "outputs": [], @@ -809,7 +809,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 71, "id": "51872c08-e01b-40c3-a8a0-e8d6a773e3df", "metadata": {}, "outputs": [ @@ -817,7 +817,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "the-verdict.txt already exists in ./the-verdict.txt\n" + "the-verdict.txt already exists in ../01_main-chapter-code/the-verdict.txt\n" ] } ], @@ -848,7 +848,7 @@ " \"the-verdict.txt\"\n", " ),\n", " filename=\"the-verdict.txt\",\n", - " search_dirs=\".\"\n", + " search_dirs=[\"ch02/01_main-chapter-code/\", \"../01_main-chapter-code/\", \".\"]\n", ")\n", "\n", "with 
open(verdict_path, \"r\", encoding=\"utf-8\") as f: # added ../01_main-chapter-code/\n", @@ -867,7 +867,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 46, "id": "027348fd-d52f-4396-93dd-38eed142df9b", "metadata": {}, "outputs": [], @@ -886,7 +886,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 47, "id": "f705a283-355e-4460-b940-06bbc2ae4e61", "metadata": {}, "outputs": [ @@ -913,7 +913,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 48, "id": "3da42d1c-f75c-4ba7-a6c5-4cb8543d4a44", "metadata": {}, "outputs": [ @@ -947,7 +947,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 49, "id": "e1db5cce-e015-412b-ad56-060b8b638078", "metadata": {}, "outputs": [ @@ -967,7 +967,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 50, "id": "78249752-38d7-47b9-b259-912bcc093dc4", "metadata": {}, "outputs": [ @@ -987,7 +987,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 51, "id": "0331d37d-49a3-44f7-9aa9-9834e0938741", "metadata": {}, "outputs": [ @@ -1007,7 +1007,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 52, "id": "1ed1b344-f7d4-4e9e-ac34-2a04b5c5b7a8", "metadata": {}, "outputs": [ @@ -1043,7 +1043,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 53, "id": "da0e1faf-1933-43d9-b681-916c282a8f86", "metadata": {}, "outputs": [ @@ -1061,7 +1061,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 54, "id": "8b690e83-5d6b-409a-804e-321c287c24a4", "metadata": {}, "outputs": [ @@ -1087,7 +1087,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 55, "id": "2b9e6289-92cb-4d88-b3c8-e836d7c8095f", "metadata": {}, "outputs": [ @@ -1142,7 +1142,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 56, "id": "c7056cb1-a9a3-4cf6-8364-29fb493ae240", "metadata": {}, "outputs": [ @@ -1152,7 +1152,7 @@ 
"'This is some text.'" ] }, - "execution_count": 16, + "execution_count": 56, "metadata": {}, "output_type": "execute_result" } @@ -1165,7 +1165,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 57, "id": "37bc6753-8f35-4ec7-b23e-df4a12103cb4", "metadata": {}, "outputs": [ @@ -1175,7 +1175,7 @@ "'This is some text with \\n newline characters.'" ] }, - "execution_count": 17, + "execution_count": 57, "metadata": {}, "output_type": "execute_result" } @@ -1204,7 +1204,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 58, "id": "955181cb-0910-4c6a-9c22-d8292a3ec1fc", "metadata": {}, "outputs": [], @@ -1215,7 +1215,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 59, "id": "6e5ccfe7-ac67-42f3-b727-87886a8867f1", "metadata": {}, "outputs": [], @@ -1235,7 +1235,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 60, "id": "00d9bf8f-756f-48bf-81b8-b890e2c2ef13", "metadata": {}, "outputs": [ @@ -1253,7 +1253,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 61, "id": "e7addb64-2892-4e1c-85dd-4f5152740099", "metadata": {}, "outputs": [ @@ -1263,7 +1263,7 @@ "'This is some text with \\n newline characters.'" ] }, - "execution_count": 21, + "execution_count": 61, "metadata": {}, "output_type": "execute_result" } @@ -1293,7 +1293,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 72, "id": "b45b4366-2c2b-4309-9a14-febf3add8512", "metadata": {}, "outputs": [ @@ -1310,7 +1310,7 @@ "# Download files if not already present in this directory\n", "\n", "# Define the directories to search and the files to download\n", - "search_directories = [\".\", \"../02_bonus_bytepair-encoder/gpt2_model/\"]\n", + "search_directories = [\"ch02/02_bonus_bytepair-encoder/gpt2_model/\", \"../02_bonus_bytepair-encoder/gpt2_model/\", \".\"]\n", "\n", "files_to_download = {\n", " 
\"https://openaipublic.blob.core.windows.net/gpt-2/models/124M/vocab.bpe\": \"vocab.bpe\",\n", diff --git a/ch02/05_bpe-from-scratch/tests/tests.py b/ch02/05_bpe-from-scratch/tests.py similarity index 90% rename from ch02/05_bpe-from-scratch/tests/tests.py rename to ch02/05_bpe-from-scratch/tests.py index 4ed2a16..0a36b1f 100644 --- a/ch02/05_bpe-from-scratch/tests/tests.py +++ b/ch02/05_bpe-from-scratch/tests.py @@ -10,7 +10,7 @@ import tiktoken def import_definitions_from_notebook(fullname, names): """Loads function definitions from a Jupyter notebook file into a module.""" - path = os.path.join(os.path.dirname(__file__), "..", fullname + ".ipynb") + path = os.path.join(os.path.dirname(__file__), fullname + ".ipynb") path = os.path.normpath(path) if not os.path.exists(path): @@ -42,12 +42,30 @@ def imported_module(): return import_definitions_from_notebook(fullname, names) +@pytest.fixture(scope="module") +def verdict_file(imported_module): + """Fixture to handle downloading The Verdict file.""" + download_file_if_absent = getattr(imported_module, "download_file_if_absent", None) + + verdict_path = download_file_if_absent( + url=( + "https://raw.githubusercontent.com/rasbt/" + "LLMs-from-scratch/main/ch02/01_main-chapter-code/" + "the-verdict.txt" + ), + filename="the-verdict.txt", + search_dirs=["ch02/01_main-chapter-code/", "../01_main-chapter-code/", "."] + ) + + return verdict_path + + @pytest.fixture(scope="module") def gpt2_files(imported_module): """Fixture to handle downloading GPT-2 files.""" download_file_if_absent = getattr(imported_module, "download_file_if_absent", None) - search_directories = [".", "../02_bonus_bytepair-encoder/gpt2_model/"] + search_directories = ["ch02/02_bonus_bytepair-encoder/gpt2_model/", "../02_bonus_bytepair-encoder/gpt2_model/", "."] files_to_download = { "https://openaipublic.blob.core.windows.net/gpt-2/models/124M/vocab.bpe": "vocab.bpe", "https://openaipublic.blob.core.windows.net/gpt-2/models/124M/encoder.json": 
"encoder.json" @@ -58,22 +76,11 @@ def gpt2_files(imported_module): return paths -def test_tokenizer_training(imported_module): +def test_tokenizer_training(imported_module, verdict_file): BPETokenizerSimple = getattr(imported_module, "BPETokenizerSimple", None) - download_file_if_absent = getattr(imported_module, "download_file_if_absent", None) - tokenizer = BPETokenizerSimple() - verdict_path = download_file_if_absent( - url=( - "https://raw.githubusercontent.com/rasbt/" - "LLMs-from-scratch/main/ch02/01_main-chapter-code/" - "the-verdict.txt" - ), - filename="the-verdict.txt", - search_dirs="." - ) - with open(verdict_path, "r", encoding="utf-8") as f: # added ../01_main-chapter-code/ + with open(verdict_file, "r", encoding="utf-8") as f: text = f.read() tokenizer.train(text, vocab_size=1000, allowed_special={"<|endoftext|>"})