mirror of
				https://github.com/rasbt/LLMs-from-scratch.git
				synced 2025-10-25 23:11:23 +00:00 
			
		
		
		
	formatting for consistency with production chapter
This commit is contained in:
		
							parent
							
								
									217ab77a6c
								
							
						
					
					
						commit
						ea9da3a89c
					
				| @ -153,6 +153,24 @@ | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 2, | ||||
|    "id": "40f9d9b1-6d32-485a-825a-a95392a86d79", | ||||
|    "metadata": {}, | ||||
|    "outputs": [], | ||||
|    "source": [ | ||||
|     "import os\n", | ||||
|     "import urllib.request\n", | ||||
|     "\n", | ||||
|     "if not os.path.exists(\"the-verdict.txt\"):\n", | ||||
|     "    url = (\"https://raw.githubusercontent.com/rasbt/\"\n", | ||||
|     "           \"LLMs-from-scratch/main/ch02/01_main-chapter-code/\"\n", | ||||
|     "           \"the-verdict.txt\")\n", | ||||
|     "    file_path = \"the-verdict.txt\"\n", | ||||
|     "    urllib.request.urlretrieve(url, file_path)" | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 3, | ||||
|    "id": "8a769e87-470a-48b9-8bdb-12841b416198", | ||||
|    "metadata": {}, | ||||
|    "outputs": [ | ||||
| @ -185,7 +203,7 @@ | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 3, | ||||
|    "execution_count": 4, | ||||
|    "id": "737dd5b0-9dbb-4a97-9ae4-3482c8c04be7", | ||||
|    "metadata": {}, | ||||
|    "outputs": [ | ||||
| @ -216,7 +234,7 @@ | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 4, | ||||
|    "execution_count": 5, | ||||
|    "id": "ea02489d-01f9-4247-b7dd-a0d63f62ef07", | ||||
|    "metadata": {}, | ||||
|    "outputs": [ | ||||
| @ -244,7 +262,7 @@ | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 5, | ||||
|    "execution_count": 6, | ||||
|    "id": "4d8a6fb7-2e62-4a12-ad06-ccb04f25fed7", | ||||
|    "metadata": {}, | ||||
|    "outputs": [ | ||||
| @ -272,7 +290,7 @@ | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 6, | ||||
|    "execution_count": 7, | ||||
|    "id": "ed3a9467-04b4-49d9-96c5-b8042bcf8374", | ||||
|    "metadata": {}, | ||||
|    "outputs": [ | ||||
| @ -310,7 +328,7 @@ | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 7, | ||||
|    "execution_count": 8, | ||||
|    "id": "8c567caa-8ff5-49a8-a5cc-d365b0a78a99", | ||||
|    "metadata": {}, | ||||
|    "outputs": [ | ||||
| @ -338,7 +356,7 @@ | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 8, | ||||
|    "execution_count": 9, | ||||
|    "id": "35db7b5e-510b-4c45-995f-f5ad64a8e19c", | ||||
|    "metadata": {}, | ||||
|    "outputs": [ | ||||
| @ -388,7 +406,7 @@ | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 9, | ||||
|    "execution_count": 10, | ||||
|    "id": "7fdf0533-5ab6-42a5-83fa-a3b045de6396", | ||||
|    "metadata": {}, | ||||
|    "outputs": [ | ||||
| @ -409,7 +427,7 @@ | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 10, | ||||
|    "execution_count": 11, | ||||
|    "id": "77d00d96-881f-4691-bb03-84fec2a75a26", | ||||
|    "metadata": {}, | ||||
|    "outputs": [], | ||||
| @ -427,7 +445,7 @@ | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 11, | ||||
|    "execution_count": 12, | ||||
|    "id": "e1c5de4a-aa4e-4aec-b532-10bb364039d6", | ||||
|    "metadata": {}, | ||||
|    "outputs": [ | ||||
| @ -522,7 +540,7 @@ | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 12, | ||||
|    "execution_count": 13, | ||||
|    "id": "f531bf46-7c25-4ef8-bff8-0d27518676d5", | ||||
|    "metadata": {}, | ||||
|    "outputs": [], | ||||
| @ -534,7 +552,9 @@ | ||||
|     "    \n", | ||||
|     "    def encode(self, text):\n", | ||||
|     "        preprocessed = re.split(r'([,.?_!\"()\\']|--|\\s)', text)\n", | ||||
|     "        preprocessed = [item.strip() for item in preprocessed if item.strip()]\n", | ||||
|     "        preprocessed = [\n", | ||||
|     "            item.strip() for item in preprocessed if item.strip()\n", | ||||
|     "        ]\n", | ||||
|     "        ids = [self.str_to_int[s] for s in preprocessed]\n", | ||||
|     "        return ids\n", | ||||
|     "        \n", | ||||
| @ -573,7 +593,7 @@ | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 13, | ||||
|    "execution_count": 14, | ||||
|    "id": "647364ec-7995-4654-9b4a-7607ccf5f1e4", | ||||
|    "metadata": {}, | ||||
|    "outputs": [ | ||||
| @ -588,7 +608,8 @@ | ||||
|    "source": [ | ||||
|     "tokenizer = SimpleTokenizerV1(vocab)\n", | ||||
|     "\n", | ||||
|     "text = \"\"\"\"It's the last he painted, you know,\" Mrs. Gisburn said with pardonable pride.\"\"\"\n", | ||||
|     "text = \"\"\"\"It's the last he painted, you know,\" \n", | ||||
|     "           Mrs. Gisburn said with pardonable pride.\"\"\"\n", | ||||
|     "ids = tokenizer.encode(text)\n", | ||||
|     "print(ids)" | ||||
|    ] | ||||
| @ -603,7 +624,7 @@ | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 14, | ||||
|    "execution_count": 15, | ||||
|    "id": "01d8c8fb-432d-4a49-b332-99f23b233746", | ||||
|    "metadata": {}, | ||||
|    "outputs": [ | ||||
| @ -613,7 +634,7 @@ | ||||
|        "'\" It\\' s the last he painted, you know,\" Mrs. Gisburn said with pardonable pride.'" | ||||
|       ] | ||||
|      }, | ||||
|      "execution_count": 14, | ||||
|      "execution_count": 15, | ||||
|      "metadata": {}, | ||||
|      "output_type": "execute_result" | ||||
|     } | ||||
| @ -624,7 +645,7 @@ | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 15, | ||||
|    "execution_count": 16, | ||||
|    "id": "54f6aa8b-9827-412e-9035-e827296ab0fe", | ||||
|    "metadata": {}, | ||||
|    "outputs": [ | ||||
| @ -634,7 +655,7 @@ | ||||
|        "'\" It\\' s the last he painted, you know,\" Mrs. Gisburn said with pardonable pride.'" | ||||
|       ] | ||||
|      }, | ||||
|      "execution_count": 15, | ||||
|      "execution_count": 16, | ||||
|      "metadata": {}, | ||||
|      "output_type": "execute_result" | ||||
|     } | ||||
| @ -712,7 +733,7 @@ | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 16, | ||||
|    "execution_count": 17, | ||||
|    "id": "d5767eff-440c-4de1-9289-f789349d6b85", | ||||
|    "metadata": {}, | ||||
|    "outputs": [ | ||||
| @ -723,9 +744,9 @@ | ||||
|      "traceback": [ | ||||
|       "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", | ||||
|       "\u001b[0;31mKeyError\u001b[0m                                  Traceback (most recent call last)", | ||||
|       "Cell \u001b[0;32mIn[16], line 5\u001b[0m\n\u001b[1;32m      1\u001b[0m tokenizer \u001b[38;5;241m=\u001b[39m SimpleTokenizerV1(vocab)\n\u001b[1;32m      3\u001b[0m text \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mHello, do you like tea. Is this-- a test?\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m----> 5\u001b[0m \u001b[43mtokenizer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mencode\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtext\u001b[49m\u001b[43m)\u001b[49m\n", | ||||
|       "Cell \u001b[0;32mIn[12], line 9\u001b[0m, in \u001b[0;36mSimpleTokenizerV1.encode\u001b[0;34m(self, text)\u001b[0m\n\u001b[1;32m      7\u001b[0m preprocessed \u001b[38;5;241m=\u001b[39m re\u001b[38;5;241m.\u001b[39msplit(\u001b[38;5;124mr\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m([,.?_!\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m()\u001b[39m\u001b[38;5;130;01m\\'\u001b[39;00m\u001b[38;5;124m]|--|\u001b[39m\u001b[38;5;124m\\\u001b[39m\u001b[38;5;124ms)\u001b[39m\u001b[38;5;124m'\u001b[39m, text)\n\u001b[1;32m      8\u001b[0m preprocessed \u001b[38;5;241m=\u001b[39m [item\u001b[38;5;241m.\u001b[39mstrip() \u001b[38;5;28;01mfor\u001b[39;00m item \u001b[38;5;129;01min\u001b[39;00m preprocessed \u001b[38;5;28;01mif\u001b[39;00m item\u001b[38;5;241m.\u001b[39mstrip()]\n\u001b[0;32m----> 9\u001b[0m ids \u001b[38;5;241m=\u001b[39m \u001b[43m[\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mstr_to_int\u001b[49m\u001b[43m[\u001b[49m\u001b[43ms\u001b[49m\u001b[43m]\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43ms\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mpreprocessed\u001b[49m\u001b[43m]\u001b[49m\n\u001b[1;32m     10\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m ids\n", | ||||
|       "Cell \u001b[0;32mIn[12], line 9\u001b[0m, in \u001b[0;36m<listcomp>\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m      7\u001b[0m preprocessed \u001b[38;5;241m=\u001b[39m re\u001b[38;5;241m.\u001b[39msplit(\u001b[38;5;124mr\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m([,.?_!\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m()\u001b[39m\u001b[38;5;130;01m\\'\u001b[39;00m\u001b[38;5;124m]|--|\u001b[39m\u001b[38;5;124m\\\u001b[39m\u001b[38;5;124ms)\u001b[39m\u001b[38;5;124m'\u001b[39m, text)\n\u001b[1;32m      8\u001b[0m preprocessed \u001b[38;5;241m=\u001b[39m [item\u001b[38;5;241m.\u001b[39mstrip() \u001b[38;5;28;01mfor\u001b[39;00m item \u001b[38;5;129;01min\u001b[39;00m preprocessed \u001b[38;5;28;01mif\u001b[39;00m item\u001b[38;5;241m.\u001b[39mstrip()]\n\u001b[0;32m----> 9\u001b[0m ids \u001b[38;5;241m=\u001b[39m [\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mstr_to_int\u001b[49m\u001b[43m[\u001b[49m\u001b[43ms\u001b[49m\u001b[43m]\u001b[49m \u001b[38;5;28;01mfor\u001b[39;00m s \u001b[38;5;129;01min\u001b[39;00m preprocessed]\n\u001b[1;32m     10\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m ids\n", | ||||
|       "Cell \u001b[0;32mIn[17], line 5\u001b[0m\n\u001b[1;32m      1\u001b[0m tokenizer \u001b[38;5;241m=\u001b[39m SimpleTokenizerV1(vocab)\n\u001b[1;32m      3\u001b[0m text \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mHello, do you like tea. Is this-- a test?\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m----> 5\u001b[0m \u001b[43mtokenizer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mencode\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtext\u001b[49m\u001b[43m)\u001b[49m\n", | ||||
|       "Cell \u001b[0;32mIn[13], line 11\u001b[0m, in \u001b[0;36mSimpleTokenizerV1.encode\u001b[0;34m(self, text)\u001b[0m\n\u001b[1;32m      7\u001b[0m preprocessed \u001b[38;5;241m=\u001b[39m re\u001b[38;5;241m.\u001b[39msplit(\u001b[38;5;124mr\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m([,.?_!\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m()\u001b[39m\u001b[38;5;130;01m\\'\u001b[39;00m\u001b[38;5;124m]|--|\u001b[39m\u001b[38;5;124m\\\u001b[39m\u001b[38;5;124ms)\u001b[39m\u001b[38;5;124m'\u001b[39m, text)\n\u001b[1;32m      8\u001b[0m preprocessed \u001b[38;5;241m=\u001b[39m [\n\u001b[1;32m      9\u001b[0m     item\u001b[38;5;241m.\u001b[39mstrip() \u001b[38;5;28;01mfor\u001b[39;00m item \u001b[38;5;129;01min\u001b[39;00m preprocessed \u001b[38;5;28;01mif\u001b[39;00m item\u001b[38;5;241m.\u001b[39mstrip()\n\u001b[1;32m     10\u001b[0m ]\n\u001b[0;32m---> 11\u001b[0m ids \u001b[38;5;241m=\u001b[39m \u001b[43m[\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mstr_to_int\u001b[49m\u001b[43m[\u001b[49m\u001b[43ms\u001b[49m\u001b[43m]\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43ms\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mpreprocessed\u001b[49m\u001b[43m]\u001b[49m\n\u001b[1;32m     12\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m ids\n", | ||||
|       "Cell \u001b[0;32mIn[13], line 11\u001b[0m, in \u001b[0;36m<listcomp>\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m      7\u001b[0m preprocessed \u001b[38;5;241m=\u001b[39m re\u001b[38;5;241m.\u001b[39msplit(\u001b[38;5;124mr\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m([,.?_!\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m()\u001b[39m\u001b[38;5;130;01m\\'\u001b[39;00m\u001b[38;5;124m]|--|\u001b[39m\u001b[38;5;124m\\\u001b[39m\u001b[38;5;124ms)\u001b[39m\u001b[38;5;124m'\u001b[39m, text)\n\u001b[1;32m      8\u001b[0m preprocessed \u001b[38;5;241m=\u001b[39m [\n\u001b[1;32m      9\u001b[0m     item\u001b[38;5;241m.\u001b[39mstrip() \u001b[38;5;28;01mfor\u001b[39;00m item \u001b[38;5;129;01min\u001b[39;00m preprocessed \u001b[38;5;28;01mif\u001b[39;00m item\u001b[38;5;241m.\u001b[39mstrip()\n\u001b[1;32m     10\u001b[0m ]\n\u001b[0;32m---> 11\u001b[0m ids \u001b[38;5;241m=\u001b[39m [\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mstr_to_int\u001b[49m\u001b[43m[\u001b[49m\u001b[43ms\u001b[49m\u001b[43m]\u001b[49m \u001b[38;5;28;01mfor\u001b[39;00m s \u001b[38;5;129;01min\u001b[39;00m preprocessed]\n\u001b[1;32m     12\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m ids\n", | ||||
|       "\u001b[0;31mKeyError\u001b[0m: 'Hello'" | ||||
|      ] | ||||
|     } | ||||
| @ -750,7 +771,7 @@ | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 17, | ||||
|    "execution_count": 18, | ||||
|    "id": "ce9df29c-6c5b-43f1-8c1a-c7f7b79db78f", | ||||
|    "metadata": {}, | ||||
|    "outputs": [], | ||||
| @ -766,7 +787,7 @@ | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 18, | ||||
|    "execution_count": 19, | ||||
|    "id": "57c3143b-e860-4d3b-a22a-de22b547a6a9", | ||||
|    "metadata": {}, | ||||
|    "outputs": [ | ||||
| @ -776,7 +797,7 @@ | ||||
|        "1161" | ||||
|       ] | ||||
|      }, | ||||
|      "execution_count": 18, | ||||
|      "execution_count": 19, | ||||
|      "metadata": {}, | ||||
|      "output_type": "execute_result" | ||||
|     } | ||||
| @ -787,7 +808,7 @@ | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 19, | ||||
|    "execution_count": 20, | ||||
|    "id": "50e51bb1-ae05-4aa8-a9ff-455b65ed1959", | ||||
|    "metadata": {}, | ||||
|    "outputs": [ | ||||
| @ -818,7 +839,7 @@ | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 20, | ||||
|    "execution_count": 21, | ||||
|    "id": "948861c5-3f30-4712-a234-725f20d26f68", | ||||
|    "metadata": {}, | ||||
|    "outputs": [], | ||||
| @ -831,8 +852,10 @@ | ||||
|     "    def encode(self, text):\n", | ||||
|     "        preprocessed = re.split(r'([,.?_!\"()\\']|--|\\s)', text)\n", | ||||
|     "        preprocessed = [item.strip() for item in preprocessed if item.strip()]\n", | ||||
|     "        preprocessed = [item if item in self.str_to_int \n", | ||||
|     "                        else \"<|unk|>\" for item in preprocessed]\n", | ||||
|     "        preprocessed = [\n", | ||||
|     "            item if item in self.str_to_int \n", | ||||
|     "            else \"<|unk|>\" for item in preprocessed\n", | ||||
|     "        ]\n", | ||||
|     "\n", | ||||
|     "        ids = [self.str_to_int[s] for s in preprocessed]\n", | ||||
|     "        return ids\n", | ||||
| @ -854,7 +877,7 @@ | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 21, | ||||
|    "execution_count": 22, | ||||
|    "id": "effcef79-e0a5-4f4a-a43a-31dd94b9250a", | ||||
|    "metadata": {}, | ||||
|    "outputs": [ | ||||
| @ -879,7 +902,7 @@ | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 22, | ||||
|    "execution_count": 23, | ||||
|    "id": "ddfe7346-398d-4bf8-99f1-5b071244ce95", | ||||
|    "metadata": {}, | ||||
|    "outputs": [ | ||||
| @ -904,7 +927,7 @@ | ||||
|        " 7]" | ||||
|       ] | ||||
|      }, | ||||
|      "execution_count": 22, | ||||
|      "execution_count": 23, | ||||
|      "metadata": {}, | ||||
|      "output_type": "execute_result" | ||||
|     } | ||||
| @ -915,7 +938,7 @@ | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 23, | ||||
|    "execution_count": 24, | ||||
|    "id": "0c350ff6-2734-4e84-9ec7-d578baa4ae1b", | ||||
|    "metadata": {}, | ||||
|    "outputs": [ | ||||
| @ -925,7 +948,7 @@ | ||||
|        "'<|unk|>, do you like tea? <|endoftext|> In the sunlit terraces of the <|unk|>.'" | ||||
|       ] | ||||
|      }, | ||||
|      "execution_count": 23, | ||||
|      "execution_count": 24, | ||||
|      "metadata": {}, | ||||
|      "output_type": "execute_result" | ||||
|     } | ||||
| @ -957,7 +980,7 @@ | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 24, | ||||
|    "execution_count": 25, | ||||
|    "id": "ede1d41f-934b-4bf4-8184-54394a257a94", | ||||
|    "metadata": {}, | ||||
|    "outputs": [], | ||||
| @ -967,7 +990,7 @@ | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 25, | ||||
|    "execution_count": 26, | ||||
|    "id": "48967a77-7d17-42bf-9e92-fc619d63a59e", | ||||
|    "metadata": {}, | ||||
|    "outputs": [ | ||||
| @ -988,7 +1011,7 @@ | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 26, | ||||
|    "execution_count": 27, | ||||
|    "id": "6ad3312f-a5f7-4efc-9d7d-8ea09d7b5128", | ||||
|    "metadata": {}, | ||||
|    "outputs": [], | ||||
| @ -998,7 +1021,7 @@ | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 27, | ||||
|    "execution_count": 28, | ||||
|    "id": "5ff2cd85-7cfb-4325-b390-219938589428", | ||||
|    "metadata": {}, | ||||
|    "outputs": [ | ||||
| @ -1006,12 +1029,15 @@ | ||||
|      "name": "stdout", | ||||
|      "output_type": "stream", | ||||
|      "text": [ | ||||
|       "[15496, 11, 466, 345, 588, 8887, 30, 220, 50256, 554, 262, 4252, 18250, 8812, 2114, 286, 617, 34680, 27271, 13]\n" | ||||
|       "[15496, 11, 466, 345, 588, 8887, 30, 220, 50256, 554, 262, 4252, 18250, 8812, 2114, 286, 617, 34680, 27271, 13]\n" | ||||
|      ] | ||||
|     } | ||||
|    ], | ||||
|    "source": [ | ||||
|     "text = \"Hello, do you like tea? <|endoftext|> In the sunlit terraces of someunknownPlace.\"\n", | ||||
|     "text = (\n", | ||||
|     "    \"Hello, do you like tea? <|endoftext|> In the sunlit terraces\"\n", | ||||
|     "     \" of someunknownPlace.\"\n", | ||||
|     ")\n", | ||||
|     "\n", | ||||
|     "integers = tokenizer.encode(text, allowed_special={\"<|endoftext|>\"})\n", | ||||
|     "\n", | ||||
| @ -1020,7 +1046,7 @@ | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 28, | ||||
|    "execution_count": 29, | ||||
|    "id": "d26a48bb-f82e-41a8-a955-a1c9cf9d50ab", | ||||
|    "metadata": {}, | ||||
|    "outputs": [ | ||||
| @ -1080,7 +1106,7 @@ | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 29, | ||||
|    "execution_count": 30, | ||||
|    "id": "848d5ade-fd1f-46c3-9e31-1426e315c71b", | ||||
|    "metadata": {}, | ||||
|    "outputs": [ | ||||
| @ -1111,7 +1137,7 @@ | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 30, | ||||
|    "execution_count": 31, | ||||
|    "id": "e84424a7-646d-45b6-99e3-80d15fb761f2", | ||||
|    "metadata": {}, | ||||
|    "outputs": [], | ||||
| @ -1121,7 +1147,7 @@ | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 31, | ||||
|    "execution_count": 32, | ||||
|    "id": "dfbff852-a92f-48c8-a46d-143a0f109f40", | ||||
|    "metadata": {}, | ||||
|    "outputs": [ | ||||
| @ -1154,7 +1180,7 @@ | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 32, | ||||
|    "execution_count": 33, | ||||
|    "id": "d97b031e-ed55-409d-95f2-aeb38c6fe366", | ||||
|    "metadata": {}, | ||||
|    "outputs": [ | ||||
| @ -1179,7 +1205,7 @@ | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 33, | ||||
|    "execution_count": 34, | ||||
|    "id": "f57bd746-dcbf-4433-8e24-ee213a8c34a1", | ||||
|    "metadata": {}, | ||||
|    "outputs": [ | ||||
| @ -1221,7 +1247,7 @@ | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 34, | ||||
|    "execution_count": 35, | ||||
|    "id": "e1770134-e7f3-4725-a679-e04c3be48cac", | ||||
|    "metadata": {}, | ||||
|    "outputs": [ | ||||
| @ -1258,7 +1284,7 @@ | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 35, | ||||
|    "execution_count": 36, | ||||
|    "id": "74b41073-4c9f-46e2-a1bd-d38e4122b375", | ||||
|    "metadata": {}, | ||||
|    "outputs": [], | ||||
| @ -1290,12 +1316,14 @@ | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 36, | ||||
|    "execution_count": 37, | ||||
|    "id": "5eb30ebe-97b3-43c5-9ff1-a97d621b3c4e", | ||||
|    "metadata": {}, | ||||
|    "outputs": [], | ||||
|    "source": [ | ||||
|     "def create_dataloader_v1(txt, batch_size=4, max_length=256, stride=128, shuffle=True, drop_last=True, num_workers=0):\n", | ||||
|     "def create_dataloader_v1(txt, batch_size=4, max_length=256, \n", | ||||
|     "                         stride=128, shuffle=True, drop_last=True,\n", | ||||
|     "                         num_workers=0):\n", | ||||
|     "\n", | ||||
|     "    # Initialize the tokenizer\n", | ||||
|     "    tokenizer = tiktoken.get_encoding(\"gpt2\")\n", | ||||
| @ -1325,7 +1353,7 @@ | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 37, | ||||
|    "execution_count": 38, | ||||
|    "id": "df31d96c-6bfd-4564-a956-6192242d7579", | ||||
|    "metadata": {}, | ||||
|    "outputs": [], | ||||
| @ -1336,7 +1364,7 @@ | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 38, | ||||
|    "execution_count": 39, | ||||
|    "id": "9226d00c-ad9a-4949-a6e4-9afccfc7214f", | ||||
|    "metadata": {}, | ||||
|    "outputs": [ | ||||
| @ -1349,7 +1377,9 @@ | ||||
|     } | ||||
|    ], | ||||
|    "source": [ | ||||
|     "dataloader = create_dataloader_v1(raw_text, batch_size=1, max_length=4, stride=1, shuffle=False)\n", | ||||
|     "dataloader = create_dataloader_v1(\n", | ||||
|     "    raw_text, batch_size=1, max_length=4, stride=1, shuffle=False\n", | ||||
|     ")\n", | ||||
|     "\n", | ||||
|     "data_iter = iter(dataloader)\n", | ||||
|     "first_batch = next(data_iter)\n", | ||||
| @ -1358,7 +1388,7 @@ | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 39, | ||||
|    "execution_count": 40, | ||||
|    "id": "10deb4bc-4de1-4d20-921e-4b1c7a0e1a6d", | ||||
|    "metadata": {}, | ||||
|    "outputs": [ | ||||
| @ -1402,7 +1432,7 @@ | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 40, | ||||
|    "execution_count": 41, | ||||
|    "id": "1916e7a6-f03d-4f09-91a6-d0bdbac5a58c", | ||||
|    "metadata": {}, | ||||
|    "outputs": [ | ||||
| @ -1477,7 +1507,7 @@ | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 41, | ||||
|    "execution_count": 42, | ||||
|    "id": "15a6304c-9474-4470-b85d-3991a49fa653", | ||||
|    "metadata": {}, | ||||
|    "outputs": [], | ||||
| @ -1495,7 +1525,7 @@ | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 42, | ||||
|    "execution_count": 43, | ||||
|    "id": "93cb2cee-9aa6-4bb8-8977-c65661d16eda", | ||||
|    "metadata": {}, | ||||
|    "outputs": [], | ||||
| @ -1517,7 +1547,7 @@ | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 43, | ||||
|    "execution_count": 44, | ||||
|    "id": "a686eb61-e737-4351-8f1c-222913d47468", | ||||
|    "metadata": {}, | ||||
|    "outputs": [ | ||||
| @ -1558,7 +1588,7 @@ | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 44, | ||||
|    "execution_count": 45, | ||||
|    "id": "e43600ba-f287-4746-8ddf-d0f71a9023ca", | ||||
|    "metadata": {}, | ||||
|    "outputs": [ | ||||
| @ -1585,7 +1615,7 @@ | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 45, | ||||
|    "execution_count": 46, | ||||
|    "id": "50280ead-0363-44c8-8c35-bb885d92c8b7", | ||||
|    "metadata": {}, | ||||
|    "outputs": [ | ||||
| @ -1679,7 +1709,7 @@ | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 46, | ||||
|    "execution_count": 47, | ||||
|    "id": "0b9e344d-03a6-4f2c-b723-67b6a20c5041", | ||||
|    "metadata": {}, | ||||
|    "outputs": [], | ||||
| @ -1701,20 +1731,23 @@ | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 47, | ||||
|    "execution_count": 48, | ||||
|    "id": "ad56a263-3d2e-4d91-98bf-d0b68d3c7fc3", | ||||
|    "metadata": {}, | ||||
|    "outputs": [], | ||||
|    "source": [ | ||||
|     "max_length = 4\n", | ||||
|     "dataloader = create_dataloader_v1(raw_text, batch_size=8, max_length=max_length, stride=max_length, shuffle=False)\n", | ||||
|     "dataloader = create_dataloader_v1(\n", | ||||
|     "    raw_text, batch_size=8, max_length=max_length,\n", | ||||
|     "    stride=max_length, shuffle=False\n", | ||||
|     ")\n", | ||||
|     "data_iter = iter(dataloader)\n", | ||||
|     "inputs, targets = next(data_iter)" | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 48, | ||||
|    "execution_count": 49, | ||||
|    "id": "84416b60-3707-4370-bcbc-da0b62f2b64d", | ||||
|    "metadata": {}, | ||||
|    "outputs": [ | ||||
| @ -1744,7 +1777,7 @@ | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 49, | ||||
|    "execution_count": 50, | ||||
|    "id": "7766ec38-30d0-4128-8c31-f49f063c43d1", | ||||
|    "metadata": {}, | ||||
|    "outputs": [ | ||||
| @ -1771,7 +1804,7 @@ | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 50, | ||||
|    "execution_count": 51, | ||||
|    "id": "cc048e20-7ac8-417e-81f5-8fe6f9a4fe07", | ||||
|    "metadata": {}, | ||||
|    "outputs": [], | ||||
| @ -1782,7 +1815,7 @@ | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 51, | ||||
|    "execution_count": 52, | ||||
|    "id": "c369a1e7-d566-4b53-b398-d6adafb44105", | ||||
|    "metadata": {}, | ||||
|    "outputs": [ | ||||
| @ -1809,7 +1842,7 @@ | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 52, | ||||
|    "execution_count": 53, | ||||
|    "id": "b22fab89-526e-43c8-9035-5b7018e34288", | ||||
|    "metadata": {}, | ||||
|    "outputs": [ | ||||
|  | ||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user
	 rasbt
						rasbt