From 7686d4569fcc52573ef0989b22df08af9739b4ba Mon Sep 17 00:00:00 2001 From: rasbt Date: Wed, 22 May 2024 20:15:31 -0500 Subject: [PATCH 1/2] update regex --- ch02/01_main-chapter-code/ch02.ipynb | 170 +++++++++++++-------------- 1 file changed, 85 insertions(+), 85 deletions(-) diff --git a/ch02/01_main-chapter-code/ch02.ipynb b/ch02/01_main-chapter-code/ch02.ipynb index bdfcd03..bff2c8d 100644 --- a/ch02/01_main-chapter-code/ch02.ipynb +++ b/ch02/01_main-chapter-code/ch02.ipynb @@ -290,7 +290,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 8, "id": "ed3a9467-04b4-49d9-96c5-b8042bcf8374", "metadata": {}, "outputs": [ @@ -328,7 +328,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 9, "id": "8c567caa-8ff5-49a8-a5cc-d365b0a78a99", "metadata": {}, "outputs": [ @@ -341,7 +341,7 @@ } ], "source": [ - "preprocessed = re.split(r'([,.?_!\"()\\']|--|\\s)', raw_text)\n", + "preprocessed = re.split(r'([,.:;?_!\"()\\']|--|\\s)', raw_text)\n", "preprocessed = [item.strip() for item in preprocessed if item.strip()]\n", "print(preprocessed[:30])" ] @@ -356,7 +356,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 10, "id": "35db7b5e-510b-4c45-995f-f5ad64a8e19c", "metadata": {}, "outputs": [ @@ -364,7 +364,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "4649\n" + "4690\n" ] } ], @@ -406,7 +406,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 11, "id": "7fdf0533-5ab6-42a5-83fa-a3b045de6396", "metadata": {}, "outputs": [ @@ -414,7 +414,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "1159\n" + "1130\n" ] } ], @@ -427,7 +427,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 12, "id": "77d00d96-881f-4691-bb03-84fec2a75a26", "metadata": {}, "outputs": [], @@ -445,7 +445,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 13, "id": "e1c5de4a-aa4e-4aec-b532-10bb364039d6", "metadata": {}, "outputs": [ @@ -478,32 +478,32 @@ "('But', 22)\n", "('By', 23)\n", "('Carlo', 24)\n", - "('Carlo;', 25)\n", - "('Chicago', 26)\n", - "('Claude', 27)\n", - "('Come', 28)\n", - "('Croft', 29)\n", - "('Destroyed', 30)\n", - "('Devonshire', 31)\n", - "('Don', 32)\n", - "('Dubarry', 33)\n", - "('Emperors', 34)\n", - "('Florence', 35)\n", - "('For', 36)\n", - "('Gallery', 37)\n", - "('Gideon', 38)\n", - "('Gisburn', 39)\n", - "('Gisburns', 40)\n", - "('Grafton', 41)\n", - "('Greek', 42)\n", - "('Grindle', 43)\n", - "('Grindle:', 44)\n", - "('Grindles', 45)\n", - "('HAD', 46)\n", - "('Had', 47)\n", - "('Hang', 48)\n", - "('Has', 49)\n", - "('He', 50)\n" + "('Chicago', 25)\n", + "('Claude', 26)\n", + "('Come', 27)\n", + "('Croft', 28)\n", + "('Destroyed', 29)\n", + "('Devonshire', 30)\n", + "('Don', 31)\n", + "('Dubarry', 32)\n", + "('Emperors', 33)\n", + "('Florence', 34)\n", + "('For', 35)\n", + "('Gallery', 36)\n", + "('Gideon', 37)\n", + "('Gisburn', 38)\n", + "('Gisburns', 39)\n", + "('Grafton', 40)\n", + "('Greek', 41)\n", + "('Grindle', 42)\n", + "('Grindles', 43)\n", + "('HAD', 44)\n", + "('Had', 45)\n", + "('Hang', 46)\n", + "('Has', 47)\n", + "('He', 48)\n", + "('Her', 49)\n", + "('Hermia', 50)\n" ] } ], @@ -540,7 +540,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 14, "id": "f531bf46-7c25-4ef8-bff8-0d27518676d5", "metadata": {}, "outputs": [], @@ -593,7 +593,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 15, "id": "647364ec-7995-4654-9b4a-7607ccf5f1e4", "metadata": {}, "outputs": [ @@ -601,7 +601,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "[1, 58, 2, 872, 1013, 615, 541, 763, 5, 1155, 608, 5, 1, 69, 7, 39, 873, 1136, 773, 812, 7]\n" + "[1, 56, 2, 850, 988, 602, 533, 746, 5, 1126, 596, 5, 1, 67, 7, 38, 851, 1108, 754, 793, 7]\n" ] } ], @@ -624,7 +624,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 16, "id": "01d8c8fb-432d-4a49-b332-99f23b233746", "metadata": {}, "outputs": [ @@ -634,7 +634,7 @@ "'\" It\\' s the last he painted, you know,\" Mrs. Gisburn said with pardonable pride.'" ] }, - "execution_count": 15, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -645,7 +645,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 17, "id": "54f6aa8b-9827-412e-9035-e827296ab0fe", "metadata": {}, "outputs": [ @@ -655,7 +655,7 @@ "'\" It\\' s the last he painted, you know,\" Mrs. Gisburn said with pardonable pride.'" ] }, - "execution_count": 16, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } @@ -733,7 +733,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 18, "id": "d5767eff-440c-4de1-9289-f789349d6b85", "metadata": {}, "outputs": [ @@ -744,9 +744,9 @@ "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[17], line 5\u001b[0m\n\u001b[1;32m 1\u001b[0m tokenizer \u001b[38;5;241m=\u001b[39m SimpleTokenizerV1(vocab)\n\u001b[1;32m 3\u001b[0m text \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mHello, do you like tea. Is this-- a test?\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m----> 5\u001b[0m \u001b[43mtokenizer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mencode\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtext\u001b[49m\u001b[43m)\u001b[49m\n", - "Cell \u001b[0;32mIn[13], line 11\u001b[0m, in \u001b[0;36mSimpleTokenizerV1.encode\u001b[0;34m(self, text)\u001b[0m\n\u001b[1;32m 7\u001b[0m preprocessed \u001b[38;5;241m=\u001b[39m re\u001b[38;5;241m.\u001b[39msplit(\u001b[38;5;124mr\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m([,.?_!\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m()\u001b[39m\u001b[38;5;130;01m\\'\u001b[39;00m\u001b[38;5;124m]|--|\u001b[39m\u001b[38;5;124m\\\u001b[39m\u001b[38;5;124ms)\u001b[39m\u001b[38;5;124m'\u001b[39m, text)\n\u001b[1;32m 8\u001b[0m preprocessed \u001b[38;5;241m=\u001b[39m [\n\u001b[1;32m 9\u001b[0m item\u001b[38;5;241m.\u001b[39mstrip() \u001b[38;5;28;01mfor\u001b[39;00m item \u001b[38;5;129;01min\u001b[39;00m preprocessed \u001b[38;5;28;01mif\u001b[39;00m item\u001b[38;5;241m.\u001b[39mstrip()\n\u001b[1;32m 10\u001b[0m ]\n\u001b[0;32m---> 11\u001b[0m ids \u001b[38;5;241m=\u001b[39m \u001b[43m[\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mstr_to_int\u001b[49m\u001b[43m[\u001b[49m\u001b[43ms\u001b[49m\u001b[43m]\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43ms\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mpreprocessed\u001b[49m\u001b[43m]\u001b[49m\n\u001b[1;32m 12\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m ids\n", - "Cell \u001b[0;32mIn[13], line 11\u001b[0m, in \u001b[0;36m\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m 7\u001b[0m preprocessed \u001b[38;5;241m=\u001b[39m re\u001b[38;5;241m.\u001b[39msplit(\u001b[38;5;124mr\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m([,.?_!\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m()\u001b[39m\u001b[38;5;130;01m\\'\u001b[39;00m\u001b[38;5;124m]|--|\u001b[39m\u001b[38;5;124m\\\u001b[39m\u001b[38;5;124ms)\u001b[39m\u001b[38;5;124m'\u001b[39m, text)\n\u001b[1;32m 8\u001b[0m preprocessed \u001b[38;5;241m=\u001b[39m [\n\u001b[1;32m 9\u001b[0m item\u001b[38;5;241m.\u001b[39mstrip() \u001b[38;5;28;01mfor\u001b[39;00m item \u001b[38;5;129;01min\u001b[39;00m preprocessed \u001b[38;5;28;01mif\u001b[39;00m item\u001b[38;5;241m.\u001b[39mstrip()\n\u001b[1;32m 10\u001b[0m ]\n\u001b[0;32m---> 11\u001b[0m ids \u001b[38;5;241m=\u001b[39m [\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mstr_to_int\u001b[49m\u001b[43m[\u001b[49m\u001b[43ms\u001b[49m\u001b[43m]\u001b[49m \u001b[38;5;28;01mfor\u001b[39;00m s \u001b[38;5;129;01min\u001b[39;00m preprocessed]\n\u001b[1;32m 12\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m ids\n", + "Cell \u001b[0;32mIn[18], line 5\u001b[0m\n\u001b[1;32m 1\u001b[0m tokenizer \u001b[38;5;241m=\u001b[39m SimpleTokenizerV1(vocab)\n\u001b[1;32m 3\u001b[0m text \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mHello, do you like tea. Is this-- a test?\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m----> 5\u001b[0m \u001b[43mtokenizer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mencode\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtext\u001b[49m\u001b[43m)\u001b[49m\n", + "Cell \u001b[0;32mIn[14], line 11\u001b[0m, in \u001b[0;36mSimpleTokenizerV1.encode\u001b[0;34m(self, text)\u001b[0m\n\u001b[1;32m 7\u001b[0m preprocessed \u001b[38;5;241m=\u001b[39m re\u001b[38;5;241m.\u001b[39msplit(\u001b[38;5;124mr\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m([,.?_!\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m()\u001b[39m\u001b[38;5;130;01m\\'\u001b[39;00m\u001b[38;5;124m]|--|\u001b[39m\u001b[38;5;124m\\\u001b[39m\u001b[38;5;124ms)\u001b[39m\u001b[38;5;124m'\u001b[39m, text)\n\u001b[1;32m 8\u001b[0m preprocessed \u001b[38;5;241m=\u001b[39m [\n\u001b[1;32m 9\u001b[0m item\u001b[38;5;241m.\u001b[39mstrip() \u001b[38;5;28;01mfor\u001b[39;00m item \u001b[38;5;129;01min\u001b[39;00m preprocessed \u001b[38;5;28;01mif\u001b[39;00m item\u001b[38;5;241m.\u001b[39mstrip()\n\u001b[1;32m 10\u001b[0m ]\n\u001b[0;32m---> 11\u001b[0m ids \u001b[38;5;241m=\u001b[39m \u001b[43m[\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mstr_to_int\u001b[49m\u001b[43m[\u001b[49m\u001b[43ms\u001b[49m\u001b[43m]\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43ms\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mpreprocessed\u001b[49m\u001b[43m]\u001b[49m\n\u001b[1;32m 12\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m ids\n", + "Cell \u001b[0;32mIn[14], line 11\u001b[0m, in \u001b[0;36m\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m 7\u001b[0m preprocessed \u001b[38;5;241m=\u001b[39m re\u001b[38;5;241m.\u001b[39msplit(\u001b[38;5;124mr\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m([,.?_!\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m()\u001b[39m\u001b[38;5;130;01m\\'\u001b[39;00m\u001b[38;5;124m]|--|\u001b[39m\u001b[38;5;124m\\\u001b[39m\u001b[38;5;124ms)\u001b[39m\u001b[38;5;124m'\u001b[39m, text)\n\u001b[1;32m 8\u001b[0m preprocessed \u001b[38;5;241m=\u001b[39m [\n\u001b[1;32m 9\u001b[0m item\u001b[38;5;241m.\u001b[39mstrip() \u001b[38;5;28;01mfor\u001b[39;00m item \u001b[38;5;129;01min\u001b[39;00m preprocessed \u001b[38;5;28;01mif\u001b[39;00m item\u001b[38;5;241m.\u001b[39mstrip()\n\u001b[1;32m 10\u001b[0m ]\n\u001b[0;32m---> 11\u001b[0m ids \u001b[38;5;241m=\u001b[39m [\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mstr_to_int\u001b[49m\u001b[43m[\u001b[49m\u001b[43ms\u001b[49m\u001b[43m]\u001b[49m \u001b[38;5;28;01mfor\u001b[39;00m s \u001b[38;5;129;01min\u001b[39;00m preprocessed]\n\u001b[1;32m 12\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m ids\n", "\u001b[0;31mKeyError\u001b[0m: 'Hello'" ] } @@ -771,7 +771,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 19, "id": "ce9df29c-6c5b-43f1-8c1a-c7f7b79db78f", "metadata": {}, "outputs": [], @@ -787,7 +787,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 20, "id": "57c3143b-e860-4d3b-a22a-de22b547a6a9", "metadata": {}, "outputs": [ @@ -797,7 +797,7 @@ "1161" ] }, - "execution_count": 19, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } @@ -808,7 +808,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 21, "id": "50e51bb1-ae05-4aa8-a9ff-455b65ed1959", "metadata": {}, "outputs": [ @@ -839,7 +839,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 22, "id": "948861c5-3f30-4712-a234-725f20d26f68", "metadata": {}, "outputs": [], @@ -877,7 +877,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 23, "id": "effcef79-e0a5-4f4a-a43a-31dd94b9250a", "metadata": {}, "outputs": [ @@ -902,7 +902,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 24, "id": "ddfe7346-398d-4bf8-99f1-5b071244ce95", "metadata": {}, "outputs": [ @@ -927,7 +927,7 @@ " 7]" ] }, - "execution_count": 23, + "execution_count": 24, "metadata": {}, "output_type": "execute_result" } @@ -938,7 +938,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 25, "id": "0c350ff6-2734-4e84-9ec7-d578baa4ae1b", "metadata": {}, "outputs": [ @@ -948,7 +948,7 @@ "'<|unk|>, do you like tea? <|endoftext|> In the sunlit terraces of the <|unk|>.'" ] }, - "execution_count": 24, + "execution_count": 25, "metadata": {}, "output_type": "execute_result" } @@ -980,7 +980,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 26, "id": "ede1d41f-934b-4bf4-8184-54394a257a94", "metadata": {}, "outputs": [], @@ -990,7 +990,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 27, "id": "48967a77-7d17-42bf-9e92-fc619d63a59e", "metadata": {}, "outputs": [ @@ -1011,7 +1011,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 28, "id": "6ad3312f-a5f7-4efc-9d7d-8ea09d7b5128", "metadata": {}, "outputs": [], @@ -1021,7 +1021,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 29, "id": "5ff2cd85-7cfb-4325-b390-219938589428", "metadata": {}, "outputs": [ @@ -1046,7 +1046,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 30, "id": "d26a48bb-f82e-41a8-a955-a1c9cf9d50ab", "metadata": {}, "outputs": [ @@ -1106,7 +1106,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 31, "id": "848d5ade-fd1f-46c3-9e31-1426e315c71b", "metadata": {}, "outputs": [ @@ -1137,7 +1137,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 32, "id": "e84424a7-646d-45b6-99e3-80d15fb761f2", "metadata": {}, "outputs": [], @@ -1147,7 +1147,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 33, "id": "dfbff852-a92f-48c8-a46d-143a0f109f40", "metadata": {}, "outputs": [ @@ -1180,7 +1180,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 34, "id": "d97b031e-ed55-409d-95f2-aeb38c6fe366", "metadata": {}, "outputs": [ @@ -1205,7 +1205,7 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 35, "id": "f57bd746-dcbf-4433-8e24-ee213a8c34a1", "metadata": {}, "outputs": [ @@ -1247,7 +1247,7 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 36, "id": "e1770134-e7f3-4725-a679-e04c3be48cac", "metadata": {}, "outputs": [ @@ -1284,7 +1284,7 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 37, "id": "74b41073-4c9f-46e2-a1bd-d38e4122b375", "metadata": {}, "outputs": [], @@ -1316,7 +1316,7 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 38, "id": "5eb30ebe-97b3-43c5-9ff1-a97d621b3c4e", "metadata": {}, "outputs": [], @@ -1353,7 +1353,7 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 39, "id": "df31d96c-6bfd-4564-a956-6192242d7579", "metadata": {}, "outputs": [], @@ -1364,7 +1364,7 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 40, "id": "9226d00c-ad9a-4949-a6e4-9afccfc7214f", "metadata": {}, "outputs": [ @@ -1388,7 +1388,7 @@ }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 41, "id": "10deb4bc-4de1-4d20-921e-4b1c7a0e1a6d", "metadata": {}, "outputs": [ @@ -1432,7 +1432,7 @@ }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 42, "id": "1916e7a6-f03d-4f09-91a6-d0bdbac5a58c", "metadata": {}, "outputs": [ @@ -1507,7 +1507,7 @@ }, { "cell_type": "code", - "execution_count": 42, + "execution_count": 43, "id": "15a6304c-9474-4470-b85d-3991a49fa653", "metadata": {}, "outputs": [], @@ -1525,7 +1525,7 @@ }, { "cell_type": "code", - "execution_count": 43, + "execution_count": 44, "id": "93cb2cee-9aa6-4bb8-8977-c65661d16eda", "metadata": {}, "outputs": [], @@ -1547,7 +1547,7 @@ }, { "cell_type": "code", - "execution_count": 44, + "execution_count": 45, "id": "a686eb61-e737-4351-8f1c-222913d47468", "metadata": {}, "outputs": [ @@ -1588,7 +1588,7 @@ }, { "cell_type": "code", - "execution_count": 45, + "execution_count": 46, "id": "e43600ba-f287-4746-8ddf-d0f71a9023ca", "metadata": {}, "outputs": [ @@ -1615,7 +1615,7 @@ }, { "cell_type": "code", - "execution_count": 46, + "execution_count": 47, "id": "50280ead-0363-44c8-8c35-bb885d92c8b7", "metadata": {}, "outputs": [ @@ -1709,7 +1709,7 @@ }, { "cell_type": "code", - "execution_count": 47, + "execution_count": 48, "id": "0b9e344d-03a6-4f2c-b723-67b6a20c5041", "metadata": {}, "outputs": [], @@ -1731,7 +1731,7 @@ }, { "cell_type": "code", - "execution_count": 48, + "execution_count": 49, "id": "ad56a263-3d2e-4d91-98bf-d0b68d3c7fc3", "metadata": {}, "outputs": [], @@ -1747,7 +1747,7 @@ }, { "cell_type": "code", - "execution_count": 49, + "execution_count": 50, "id": "84416b60-3707-4370-bcbc-da0b62f2b64d", "metadata": {}, "outputs": [ @@ -1777,7 +1777,7 @@ }, { "cell_type": "code", - "execution_count": 50, + "execution_count": 51, "id": "7766ec38-30d0-4128-8c31-f49f063c43d1", "metadata": {}, "outputs": [ @@ -1804,7 +1804,7 @@ }, { "cell_type": "code", - "execution_count": 51, + "execution_count": 52, "id": "cc048e20-7ac8-417e-81f5-8fe6f9a4fe07", "metadata": {}, "outputs": [], @@ -1815,7 +1815,7 @@ }, { "cell_type": "code", - "execution_count": 52, + "execution_count": 53, "id": "c369a1e7-d566-4b53-b398-d6adafb44105", "metadata": {}, "outputs": [ @@ -1842,7 +1842,7 @@ }, { "cell_type": "code", - "execution_count": 53, + "execution_count": 54, "id": "b22fab89-526e-43c8-9035-5b7018e34288", "metadata": {}, "outputs": [ From 5b1dcf0b33cafa2b9f50c595229685e5f6c684a3 Mon Sep 17 00:00:00 2001 From: rasbt Date: Wed, 22 May 2024 20:27:09 -0500 Subject: [PATCH 2/2] reset cell count for better nbdiff --- ch02/01_main-chapter-code/ch02.ipynb | 110 +++++++++++++-------------- 1 file changed, 55 insertions(+), 55 deletions(-) diff --git a/ch02/01_main-chapter-code/ch02.ipynb b/ch02/01_main-chapter-code/ch02.ipynb index bff2c8d..d91527b 100644 --- a/ch02/01_main-chapter-code/ch02.ipynb +++ b/ch02/01_main-chapter-code/ch02.ipynb @@ -290,7 +290,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 7, "id": "ed3a9467-04b4-49d9-96c5-b8042bcf8374", "metadata": {}, "outputs": [ @@ -328,7 +328,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 8, "id": "8c567caa-8ff5-49a8-a5cc-d365b0a78a99", "metadata": {}, "outputs": [ @@ -356,7 +356,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 9, "id": "35db7b5e-510b-4c45-995f-f5ad64a8e19c", "metadata": {}, "outputs": [ @@ -406,7 +406,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 10, "id": "7fdf0533-5ab6-42a5-83fa-a3b045de6396", "metadata": {}, "outputs": [ @@ -427,7 +427,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 11, "id": "77d00d96-881f-4691-bb03-84fec2a75a26", "metadata": {}, "outputs": [], @@ -445,7 +445,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 12, "id": "e1c5de4a-aa4e-4aec-b532-10bb364039d6", "metadata": {}, "outputs": [ @@ -540,7 +540,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 13, "id": "f531bf46-7c25-4ef8-bff8-0d27518676d5", "metadata": {}, "outputs": [], @@ -593,7 +593,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 14, "id": "647364ec-7995-4654-9b4a-7607ccf5f1e4", "metadata": {}, "outputs": [ @@ -624,7 +624,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 15, "id": "01d8c8fb-432d-4a49-b332-99f23b233746", "metadata": {}, "outputs": [ @@ -634,7 +634,7 @@ "'\" It\\' s the last he painted, you know,\" Mrs. Gisburn said with pardonable pride.'" ] }, - "execution_count": 16, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -645,7 +645,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 16, "id": "54f6aa8b-9827-412e-9035-e827296ab0fe", "metadata": {}, "outputs": [ @@ -655,7 +655,7 @@ "'\" It\\' s the last he painted, you know,\" Mrs. Gisburn said with pardonable pride.'" ] }, - "execution_count": 17, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -733,7 +733,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 17, "id": "d5767eff-440c-4de1-9289-f789349d6b85", "metadata": {}, "outputs": [ @@ -744,9 +744,9 @@ "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[18], line 5\u001b[0m\n\u001b[1;32m 1\u001b[0m tokenizer \u001b[38;5;241m=\u001b[39m SimpleTokenizerV1(vocab)\n\u001b[1;32m 3\u001b[0m text \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mHello, do you like tea. Is this-- a test?\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m----> 5\u001b[0m \u001b[43mtokenizer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mencode\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtext\u001b[49m\u001b[43m)\u001b[49m\n", - "Cell \u001b[0;32mIn[14], line 11\u001b[0m, in \u001b[0;36mSimpleTokenizerV1.encode\u001b[0;34m(self, text)\u001b[0m\n\u001b[1;32m 7\u001b[0m preprocessed \u001b[38;5;241m=\u001b[39m re\u001b[38;5;241m.\u001b[39msplit(\u001b[38;5;124mr\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m([,.?_!\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m()\u001b[39m\u001b[38;5;130;01m\\'\u001b[39;00m\u001b[38;5;124m]|--|\u001b[39m\u001b[38;5;124m\\\u001b[39m\u001b[38;5;124ms)\u001b[39m\u001b[38;5;124m'\u001b[39m, text)\n\u001b[1;32m 8\u001b[0m preprocessed \u001b[38;5;241m=\u001b[39m [\n\u001b[1;32m 9\u001b[0m item\u001b[38;5;241m.\u001b[39mstrip() \u001b[38;5;28;01mfor\u001b[39;00m item \u001b[38;5;129;01min\u001b[39;00m preprocessed \u001b[38;5;28;01mif\u001b[39;00m item\u001b[38;5;241m.\u001b[39mstrip()\n\u001b[1;32m 10\u001b[0m ]\n\u001b[0;32m---> 11\u001b[0m ids \u001b[38;5;241m=\u001b[39m \u001b[43m[\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mstr_to_int\u001b[49m\u001b[43m[\u001b[49m\u001b[43ms\u001b[49m\u001b[43m]\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43ms\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mpreprocessed\u001b[49m\u001b[43m]\u001b[49m\n\u001b[1;32m 12\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m ids\n", - "Cell \u001b[0;32mIn[14], line 11\u001b[0m, in \u001b[0;36m\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m 7\u001b[0m preprocessed \u001b[38;5;241m=\u001b[39m re\u001b[38;5;241m.\u001b[39msplit(\u001b[38;5;124mr\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m([,.?_!\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m()\u001b[39m\u001b[38;5;130;01m\\'\u001b[39;00m\u001b[38;5;124m]|--|\u001b[39m\u001b[38;5;124m\\\u001b[39m\u001b[38;5;124ms)\u001b[39m\u001b[38;5;124m'\u001b[39m, text)\n\u001b[1;32m 8\u001b[0m preprocessed \u001b[38;5;241m=\u001b[39m [\n\u001b[1;32m 9\u001b[0m item\u001b[38;5;241m.\u001b[39mstrip() \u001b[38;5;28;01mfor\u001b[39;00m item \u001b[38;5;129;01min\u001b[39;00m preprocessed \u001b[38;5;28;01mif\u001b[39;00m item\u001b[38;5;241m.\u001b[39mstrip()\n\u001b[1;32m 10\u001b[0m ]\n\u001b[0;32m---> 11\u001b[0m ids \u001b[38;5;241m=\u001b[39m [\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mstr_to_int\u001b[49m\u001b[43m[\u001b[49m\u001b[43ms\u001b[49m\u001b[43m]\u001b[49m \u001b[38;5;28;01mfor\u001b[39;00m s \u001b[38;5;129;01min\u001b[39;00m preprocessed]\n\u001b[1;32m 12\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m ids\n", + "Cell \u001b[0;32mIn[17], line 5\u001b[0m\n\u001b[1;32m 1\u001b[0m tokenizer \u001b[38;5;241m=\u001b[39m SimpleTokenizerV1(vocab)\n\u001b[1;32m 3\u001b[0m text \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mHello, do you like tea. Is this-- a test?\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m----> 5\u001b[0m \u001b[43mtokenizer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mencode\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtext\u001b[49m\u001b[43m)\u001b[49m\n", + "Cell \u001b[0;32mIn[13], line 11\u001b[0m, in \u001b[0;36mSimpleTokenizerV1.encode\u001b[0;34m(self, text)\u001b[0m\n\u001b[1;32m 7\u001b[0m preprocessed \u001b[38;5;241m=\u001b[39m re\u001b[38;5;241m.\u001b[39msplit(\u001b[38;5;124mr\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m([,.?_!\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m()\u001b[39m\u001b[38;5;130;01m\\'\u001b[39;00m\u001b[38;5;124m]|--|\u001b[39m\u001b[38;5;124m\\\u001b[39m\u001b[38;5;124ms)\u001b[39m\u001b[38;5;124m'\u001b[39m, text)\n\u001b[1;32m 8\u001b[0m preprocessed \u001b[38;5;241m=\u001b[39m [\n\u001b[1;32m 9\u001b[0m item\u001b[38;5;241m.\u001b[39mstrip() \u001b[38;5;28;01mfor\u001b[39;00m item \u001b[38;5;129;01min\u001b[39;00m preprocessed \u001b[38;5;28;01mif\u001b[39;00m item\u001b[38;5;241m.\u001b[39mstrip()\n\u001b[1;32m 10\u001b[0m ]\n\u001b[0;32m---> 11\u001b[0m ids \u001b[38;5;241m=\u001b[39m \u001b[43m[\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mstr_to_int\u001b[49m\u001b[43m[\u001b[49m\u001b[43ms\u001b[49m\u001b[43m]\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43ms\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mpreprocessed\u001b[49m\u001b[43m]\u001b[49m\n\u001b[1;32m 12\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m ids\n", + "Cell \u001b[0;32mIn[13], line 11\u001b[0m, in \u001b[0;36m\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m 7\u001b[0m preprocessed \u001b[38;5;241m=\u001b[39m re\u001b[38;5;241m.\u001b[39msplit(\u001b[38;5;124mr\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m([,.?_!\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m()\u001b[39m\u001b[38;5;130;01m\\'\u001b[39;00m\u001b[38;5;124m]|--|\u001b[39m\u001b[38;5;124m\\\u001b[39m\u001b[38;5;124ms)\u001b[39m\u001b[38;5;124m'\u001b[39m, text)\n\u001b[1;32m 8\u001b[0m preprocessed \u001b[38;5;241m=\u001b[39m [\n\u001b[1;32m 9\u001b[0m item\u001b[38;5;241m.\u001b[39mstrip() \u001b[38;5;28;01mfor\u001b[39;00m item \u001b[38;5;129;01min\u001b[39;00m preprocessed \u001b[38;5;28;01mif\u001b[39;00m item\u001b[38;5;241m.\u001b[39mstrip()\n\u001b[1;32m 10\u001b[0m ]\n\u001b[0;32m---> 11\u001b[0m ids \u001b[38;5;241m=\u001b[39m [\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mstr_to_int\u001b[49m\u001b[43m[\u001b[49m\u001b[43ms\u001b[49m\u001b[43m]\u001b[49m \u001b[38;5;28;01mfor\u001b[39;00m s \u001b[38;5;129;01min\u001b[39;00m preprocessed]\n\u001b[1;32m 12\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m ids\n", "\u001b[0;31mKeyError\u001b[0m: 'Hello'" ] } @@ -771,7 +771,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 18, "id": "ce9df29c-6c5b-43f1-8c1a-c7f7b79db78f", "metadata": {}, "outputs": [], @@ -787,7 +787,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 19, "id": "57c3143b-e860-4d3b-a22a-de22b547a6a9", "metadata": {}, "outputs": [ @@ -797,7 +797,7 @@ "1161" ] }, - "execution_count": 20, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" } @@ -808,7 +808,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 20, "id": "50e51bb1-ae05-4aa8-a9ff-455b65ed1959", "metadata": {}, "outputs": [ @@ -839,7 +839,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 21, "id": "948861c5-3f30-4712-a234-725f20d26f68", "metadata": {}, "outputs": [], @@ -877,7 +877,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 22, "id": "effcef79-e0a5-4f4a-a43a-31dd94b9250a", "metadata": {}, "outputs": [ @@ -902,7 +902,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 23, "id": "ddfe7346-398d-4bf8-99f1-5b071244ce95", "metadata": {}, "outputs": [ @@ -927,7 +927,7 @@ " 7]" ] }, - "execution_count": 24, + "execution_count": 23, "metadata": {}, "output_type": "execute_result" } @@ -938,7 +938,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 24, "id": "0c350ff6-2734-4e84-9ec7-d578baa4ae1b", "metadata": {}, "outputs": [ @@ -948,7 +948,7 @@ "'<|unk|>, do you like tea? <|endoftext|> In the sunlit terraces of the <|unk|>.'" ] }, - "execution_count": 25, + "execution_count": 24, "metadata": {}, "output_type": "execute_result" } @@ -980,7 +980,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 25, "id": "ede1d41f-934b-4bf4-8184-54394a257a94", "metadata": {}, "outputs": [], @@ -990,7 +990,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 26, "id": "48967a77-7d17-42bf-9e92-fc619d63a59e", "metadata": {}, "outputs": [ @@ -1011,7 +1011,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 27, "id": "6ad3312f-a5f7-4efc-9d7d-8ea09d7b5128", "metadata": {}, "outputs": [], @@ -1021,7 +1021,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 28, "id": "5ff2cd85-7cfb-4325-b390-219938589428", "metadata": {}, "outputs": [ @@ -1046,7 +1046,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 29, "id": "d26a48bb-f82e-41a8-a955-a1c9cf9d50ab", "metadata": {}, "outputs": [ @@ -1106,7 +1106,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 30, "id": "848d5ade-fd1f-46c3-9e31-1426e315c71b", "metadata": {}, "outputs": [ @@ -1137,7 +1137,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 31, "id": "e84424a7-646d-45b6-99e3-80d15fb761f2", "metadata": {}, "outputs": [], @@ -1147,7 +1147,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 32, "id": "dfbff852-a92f-48c8-a46d-143a0f109f40", "metadata": {}, "outputs": [ @@ -1180,7 +1180,7 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 33, "id": "d97b031e-ed55-409d-95f2-aeb38c6fe366", "metadata": {}, "outputs": [ @@ -1205,7 +1205,7 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 34, "id": "f57bd746-dcbf-4433-8e24-ee213a8c34a1", "metadata": {}, "outputs": [ @@ -1247,7 +1247,7 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 35, "id": "e1770134-e7f3-4725-a679-e04c3be48cac", "metadata": {}, "outputs": [ @@ -1284,7 +1284,7 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 36, "id": "74b41073-4c9f-46e2-a1bd-d38e4122b375", "metadata": {}, "outputs": [], @@ -1316,7 +1316,7 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 37, "id": "5eb30ebe-97b3-43c5-9ff1-a97d621b3c4e", "metadata": {}, "outputs": [], @@ -1353,7 +1353,7 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 38, "id": "df31d96c-6bfd-4564-a956-6192242d7579", "metadata": {}, "outputs": [], @@ -1364,7 +1364,7 @@ }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 39, "id": "9226d00c-ad9a-4949-a6e4-9afccfc7214f", "metadata": {}, "outputs": [ @@ -1388,7 +1388,7 @@ }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 40, "id": "10deb4bc-4de1-4d20-921e-4b1c7a0e1a6d", "metadata": {}, "outputs": [ @@ -1432,7 +1432,7 @@ }, { "cell_type": "code", - "execution_count": 42, + "execution_count": 41, "id": "1916e7a6-f03d-4f09-91a6-d0bdbac5a58c", "metadata": {}, "outputs": [ @@ -1507,7 +1507,7 @@ }, { "cell_type": "code", - "execution_count": 43, + "execution_count": 42, "id": "15a6304c-9474-4470-b85d-3991a49fa653", "metadata": {}, "outputs": [], @@ -1525,7 +1525,7 @@ }, { "cell_type": "code", - "execution_count": 44, + "execution_count": 43, "id": "93cb2cee-9aa6-4bb8-8977-c65661d16eda", "metadata": {}, "outputs": [], @@ -1547,7 +1547,7 @@ }, { "cell_type": "code", - "execution_count": 45, + "execution_count": 44, "id": "a686eb61-e737-4351-8f1c-222913d47468", "metadata": {}, "outputs": [ @@ -1588,7 +1588,7 @@ }, { "cell_type": "code", - "execution_count": 46, + "execution_count": 45, "id": "e43600ba-f287-4746-8ddf-d0f71a9023ca", "metadata": {}, "outputs": [ @@ -1615,7 +1615,7 @@ }, { "cell_type": "code", - "execution_count": 47, + "execution_count": 46, "id": "50280ead-0363-44c8-8c35-bb885d92c8b7", "metadata": {}, "outputs": [ @@ -1709,7 +1709,7 @@ }, { "cell_type": "code", - "execution_count": 48, + "execution_count": 47, "id": "0b9e344d-03a6-4f2c-b723-67b6a20c5041", "metadata": {}, "outputs": [], @@ -1731,7 +1731,7 @@ }, { "cell_type": "code", - "execution_count": 49, + "execution_count": 48, "id": "ad56a263-3d2e-4d91-98bf-d0b68d3c7fc3", "metadata": {}, "outputs": [], @@ -1747,7 +1747,7 @@ }, { "cell_type": "code", - "execution_count": 50, + "execution_count": 49, "id": "84416b60-3707-4370-bcbc-da0b62f2b64d", "metadata": {}, "outputs": [ @@ -1777,7 +1777,7 @@ }, { "cell_type": "code", - "execution_count": 51, + "execution_count": 50, "id": "7766ec38-30d0-4128-8c31-f49f063c43d1", "metadata": {}, "outputs": [ @@ -1804,7 +1804,7 @@ }, { "cell_type": "code", - "execution_count": 52, + "execution_count": 51, "id": "cc048e20-7ac8-417e-81f5-8fe6f9a4fe07", "metadata": {}, "outputs": [], @@ -1815,7 +1815,7 @@ }, { "cell_type": "code", - "execution_count": 53, + "execution_count": 52, "id": "c369a1e7-d566-4b53-b398-d6adafb44105", "metadata": {}, "outputs": [ @@ -1842,7 +1842,7 @@ }, { "cell_type": "code", - "execution_count": 54, + "execution_count": 53, "id": "b22fab89-526e-43c8-9035-5b7018e34288", "metadata": {}, "outputs": [