diff --git a/ch02/01_main-chapter-code/ch02.ipynb b/ch02/01_main-chapter-code/ch02.ipynb
index 90d25c3..8cb54d7 100644
--- a/ch02/01_main-chapter-code/ch02.ipynb
+++ b/ch02/01_main-chapter-code/ch02.ipynb
@@ -43,14 +43,16 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
+      "Total number of character: 20479\n",
       "I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no \n"
      ]
     }
    ],
    "source": [
-    "with open('the-verdict.txt', 'r', encoding='utf-8') as f:\n",
+    "with open(\"the-verdict.txt\", \"r\", encoding=\"utf-8\") as f:\n",
     "    raw_text = f.read()\n",
     "    \n",
+    "print(\"Total number of character:\", len(raw_text))\n",
     "print(raw_text[:99])"
    ]
   },
@@ -61,7 +63,7 @@
    "source": [
     "- The goal is to tokenize and embed this text for an LLM\n",
     "- Let's develop a simple tokenizer based on some simple sample text that we can then later apply to the text above\n",
-    "- The following regular expressiin will split on a comma"
+    "- The following regular expression will split on whitespaces"
    ]
   },
   {
@@ -74,15 +76,15 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "['Hello', ',', ' world', ',', ' this', ',', ' is', ',', ' a test.']\n"
+      "['Hello,', ' ', 'world.', ' ', 'This,', ' ', 'is', ' ', 'a', ' ', 'test.']\n"
      ]
     }
    ],
    "source": [
     "import re\n",
     "\n",
-    "text = \"Hello, world, this, is, a test.\"\n",
-    "result = re.split(r'(,)', text)\n",
+    "text = \"Hello, world. This, is a test.\"\n",
+    "result = re.split(r'(\\s)', text)\n",
     "\n",
     "print(result)"
    ]
@@ -92,7 +94,7 @@
    "id": "a8c40c18-a9d5-4703-bf71-8261dbcc5ee3",
    "metadata": {},
    "source": [
-    "- We don't only want to split on commas but also whitespaces, so let's modify the regular expression to do that as well"
+    "- We don't only want to split on whitespaces but also commas and periods, so let's modify the regular expression to do that as well"
    ]
   },
   {
@@ -105,14 +107,12 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "['Hello', ',', '', ' ', 'world.', ' ', 'This', ',', '', ' ', 'is', ' ', 'a', ' ', 'test.']\n"
+      "['Hello', ',', '', ' ', 'world', '.', '', ' ', 'This', ',', '', ' ', 'is', ' ', 'a', ' ', 'test', '.', '']\n"
      ]
     }
    ],
    "source": [
-    "text = \"Hello, world. This, is a test.\"\n",
-    "\n",
-    "result = re.split(r'([,]|\\s)', text)\n",
+    "result = re.split(r'([,.]|\\s)', text)\n",
     "\n",
     "print(result)"
    ]
@@ -135,7 +135,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "['Hello', ',', 'world.', 'This', ',', 'is', 'a', 'test.']\n"
+      "['Hello', ',', 'world', '.', 'This', ',', 'is', 'a', 'test', '.']\n"
      ]
     }
    ],
@@ -180,7 +180,7 @@
    "id": "5bbea70b-c030-45d9-b09d-4318164c0bb4",
    "metadata": {},
    "source": [
-    "- This is pretty good, and we are now readdy to apply this tokenization to the raw text"
+    "- This is pretty good, and we are now ready to apply this tokenization to the raw text"
    ]
   },
   {
@@ -229,6 +229,14 @@
     "print(len(preprocessed))"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "0b5ce8fe-3a07-4f2a-90f1-a0321ce3a231",
+   "metadata": {},
+   "source": [
+    "## 2.3 Converting tokens into token IDs"
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "b5973794-7002-4202-8b12-0900cd779720",
@@ -336,15 +344,14 @@
      "('Had', 47)\n",
      "('Hang', 48)\n",
      "('Has', 49)\n",
-      "('He', 50)\n",
-      "('Her', 51)\n"
+      "('He', 50)\n"
      ]
     }
    ],
    "source": [
     "for i, item in enumerate(vocab.items()):\n",
     "    print(item)\n",
-    "    if i > 50:\n",
+    "    if i >= 50:\n",
     "        break"
    ]
   },
@@ -454,7 +461,7 @@
    "id": "4b821ef8-4d53-43b6-a2b2-aef808c343c7",
    "metadata": {},
    "source": [
-    "## 2.3 Adding special context tokens"
+    "## 2.4 Adding special context tokens"
    ]
   },
   {
@@ -678,7 +685,7 @@
    "id": "5c4ba34b-170f-4e71-939b-77aabb776f14",
    "metadata": {},
    "source": [
-    "## 2.4 BytePair encoding"
+    "## 2.5 BytePair encoding"
    ]
   },
   {
@@ -782,7 +789,7 @@
    "id": "abbd7c0d-70f8-4386-a114-907e96c950b0",
    "metadata": {},
    "source": [
-    "## 2.5 Data sampling with a sliding window"
+    "## 2.6 Data sampling with a sliding window"
    ]
   },
   {
@@ -1119,7 +1126,7 @@
    "id": "2cd2fcda-2fda-4aa8-8bc8-de1e496f9db1",
    "metadata": {},
    "source": [
-    "## 2.6 Creating token embeddings"
+    "## 2.7 Creating token embeddings"
    ]
   },
   {
@@ -1304,7 +1311,7 @@
    "id": "c393d270-b950-4bc8-99ea-97d74f2ea0f6",
    "metadata": {},
    "source": [
-    "## 2.7 Encoding word positions"
+    "## 2.8 Encoding word positions"
    ]
   },
   {
@@ -1458,7 +1465,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.11.4"
+   "version": "3.10.6"
   }
  },
 "nbformat": 4,
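
Note on the tokenization hunks above: the updated cells are easy to sanity-check outside the notebook. The following is a minimal sketch reusing the same sample string and regex patterns as the diff; the final filtering step mirrors a later notebook cell that this diff leaves unchanged:

```python
import re

text = "Hello, world. This, is a test."

# First updated cell: split on whitespace; the capturing group (\s)
# keeps the separators in the result list.
print(re.split(r'(\s)', text))
# ['Hello,', ' ', 'world.', ' ', 'This,', ' ', 'is', ' ', 'a', ' ', 'test.']

# Second updated cell: split on commas and periods as well; empty strings
# appear wherever two separators are adjacent (e.g. ',' followed by ' ').
result = re.split(r'([,.]|\s)', text)
print(result)

# Later (unchanged) step: dropping whitespace-only and empty tokens
# yields the expected output shown in the third output hunk.
result = [item for item in result if item.strip()]
print(result)
# ['Hello', ',', 'world', '.', 'This', ',', 'is', 'a', 'test', '.']
```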
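The `@@ -336,15 +344,14 @@` hunk fixes an off-by-one in the vocabulary preview loop: `if i > 50` breaks only after the pair at index 51 has already been printed (52 entries, ending at `('Her', 51)`), whereas `if i >= 50` stops after index 50, matching the shortened expected output. A minimal sketch of the fixed loop, using a hypothetical stand-in vocabulary (the notebook builds `vocab` from the tokenized short story instead):

```python
# Hypothetical stand-in vocabulary, just to make the loop runnable here.
vocab = {f"token{n}": n for n in range(60)}

for i, item in enumerate(vocab.items()):
    print(item)
    if i >= 50:  # the old `i > 50` broke one iteration later, printing 52 pairs
        break
```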