Mirror of https://github.com/rasbt/LLMs-from-scratch.git (synced 2025-08-01 13:22:57 +00:00)
Commit a19305fcda
@@ -43,14 +43,16 @@
 "name": "stdout",
 "output_type": "stream",
 "text": [
+"Total number of character: 20479\n",
 "I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no \n"
 ]
 }
 ],
 "source": [
-"with open('the-verdict.txt', 'r', encoding='utf-8') as f:\n",
+"with open(\"the-verdict.txt\", \"r\", encoding=\"utf-8\") as f:\n",
 "    raw_text = f.read()\n",
 " \n",
+"print(\"Total number of character:\", len(raw_text))\n",
 "print(raw_text[:99])"
 ]
 },
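For readability, the updated cell decodes from the JSON escapes above into the following plain Python; a minimal sketch, assuming `the-verdict.txt` is present in the working directory:

```python
# Read the short story in as character data (decoded from the
# notebook-JSON source lines shown in the hunk above).
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

print("Total number of character:", len(raw_text))
print(raw_text[:99])
```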
@@ -61,7 +63,7 @@
 "source": [
 "- The goal is to tokenize and embed this text for an LLM\n",
 "- Let's develop a simple tokenizer based on some simple sample text that we can then later apply to the text above\n",
-"- The following regular expressiin will split on a comma"
+"- The following regular expression will split on whitespaces"
 ]
 },
 {
@@ -74,15 +76,15 @@
 "name": "stdout",
 "output_type": "stream",
 "text": [
-"['Hello', ',', ' world', ',', ' this', ',', ' is', ',', ' a test.']\n"
+"['Hello,', ' ', 'world.', ' ', 'This,', ' ', 'is', ' ', 'a', ' ', 'test.']\n"
 ]
 }
 ],
 "source": [
 "import re\n",
 "\n",
-"text = \"Hello, world, this, is, a test.\"\n",
-"result = re.split(r'(,)', text)\n",
+"text = \"Hello, world. This, is a test.\"\n",
+"result = re.split(r'(\\s)', text)\n",
 "\n",
 "print(result)"
 ]
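Decoded, the updated cell splits on single whitespace characters, with the capturing group keeping the separators in the result; running it reproduces the stdout recorded above:

```python
import re

text = "Hello, world. This, is a test."
result = re.split(r'(\s)', text)  # the capture group keeps the whitespace tokens
print(result)
# ['Hello,', ' ', 'world.', ' ', 'This,', ' ', 'is', ' ', 'a', ' ', 'test.']
```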
@@ -92,7 +94,7 @@
 "id": "a8c40c18-a9d5-4703-bf71-8261dbcc5ee3",
 "metadata": {},
 "source": [
-"- We don't only want to split on commas but also whitespaces, so let's modify the regular expression to do that as well"
+"- We don't only want to split on whitespaces but also commas and periods, so let's modify the regular expression to do that as well"
 ]
 },
 {
@@ -105,14 +107,12 @@
 "name": "stdout",
 "output_type": "stream",
 "text": [
-"['Hello', ',', '', ' ', 'world.', ' ', 'This', ',', '', ' ', 'is', ' ', 'a', ' ', 'test.']\n"
+"['Hello', ',', '', ' ', 'world', '.', '', ' ', 'This', ',', '', ' ', 'is', ' ', 'a', ' ', 'test', '.', '']\n"
 ]
 }
 ],
 "source": [
-"text = \"Hello, world. This, is a test.\"\n",
-"\n",
-"result = re.split(r'([,]|\\s)', text)\n",
+"result = re.split(r'([,.]|\\s)', text)\n",
 "\n",
 "print(result)"
 ]
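The updated cell now reuses `text` from the previous cell and splits on commas and periods as well; decoded, it reads:

```python
import re

text = "Hello, world. This, is a test."
result = re.split(r'([,.]|\s)', text)  # split on commas, periods, or whitespace
print(result)
# ['Hello', ',', '', ' ', 'world', '.', '', ' ', 'This', ',', '', ' ',
#  'is', ' ', 'a', ' ', 'test', '.', '']
```

Note the empty strings that appear wherever a comma or period is immediately followed by whitespace (two adjacent separators with nothing between them).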
@@ -135,7 +135,7 @@
 "name": "stdout",
 "output_type": "stream",
 "text": [
-"['Hello', ',', 'world.', 'This', ',', 'is', 'a', 'test.']\n"
+"['Hello', ',', 'world', '.', 'This', ',', 'is', 'a', 'test', '.']\n"
 ]
 }
 ],
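This hunk only updates recorded output; the cell that produces it is outside the diff. One way to get from the previous list to this one is to drop the empty and whitespace-only strings, sketched here assuming the `result` list from the example above:

```python
# Remove empty strings and whitespace-only items from the split result
# (a hypothetical reconstruction of the unshown filtering cell).
result = [item for item in result if item.strip()]
print(result)
# ['Hello', ',', 'world', '.', 'This', ',', 'is', 'a', 'test', '.']
```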
@@ -180,7 +180,7 @@
 "id": "5bbea70b-c030-45d9-b09d-4318164c0bb4",
 "metadata": {},
 "source": [
-"- This is pretty good, and we are now readdy to apply this tokenization to the raw text"
+"- This is pretty good, and we are now ready to apply this tokenization to the raw text"
 ]
 },
 {
@@ -229,6 +229,14 @@
 "print(len(preprocessed))"
 ]
 },
+{
+"cell_type": "markdown",
+"id": "0b5ce8fe-3a07-4f2a-90f1-a0321ce3a231",
+"metadata": {},
+"source": [
+"## 2.3 Converting tokens into token IDs"
+]
+},
 {
 "cell_type": "markdown",
 "id": "b5973794-7002-4202-8b12-0900cd779720",
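The `preprocessed` list whose length is printed above is built from `raw_text` by a cell outside this diff. A plausible sketch, extending the comma/period pattern to further punctuation; the exact regular expression is an assumption:

```python
import re

# Split raw_text on punctuation, double dashes, and whitespace, then
# drop whitespace-only tokens (hypothetical reconstruction of the cell).
preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', raw_text)
preprocessed = [item.strip() for item in preprocessed if item.strip()]
print(len(preprocessed))
```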
@@ -336,15 +344,14 @@
 "('Had', 47)\n",
 "('Hang', 48)\n",
 "('Has', 49)\n",
-"('He', 50)\n",
-"('Her', 51)\n"
+"('He', 50)\n"
 ]
 }
 ],
 "source": [
 "for i, item in enumerate(vocab.items()):\n",
 "    print(item)\n",
-"    if i > 50:\n",
+"    if i >= 50:\n",
 "        break"
 ]
 },
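The `vocab` enumerated here comes from a cell outside the hunk; presumably a token-to-ID mapping over the unique tokens, sketched below. Changing `i > 50` to `i >= 50` stops the loop one entry earlier, which is why `('Her', 51)` disappears from the recorded output:

```python
# Hypothetical reconstruction of the vocabulary cell: map each unique
# token to an integer ID, reusing `preprocessed` from the sketch above.
all_words = sorted(set(preprocessed))
vocab = {token: integer for integer, token in enumerate(all_words)}

for i, item in enumerate(vocab.items()):
    print(item)
    if i >= 50:  # prints IDs 0..50, i.e. 51 entries
        break
```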
@@ -454,7 +461,7 @@
 "id": "4b821ef8-4d53-43b6-a2b2-aef808c343c7",
 "metadata": {},
 "source": [
-"## 2.3 Adding special context tokens"
+"## 2.4 Adding special context tokens"
 ]
 },
 {
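Only the section number changes here; the cells under this heading are not in the diff. As a rough sketch of what "special context tokens" typically involves in this setting, with `<|endoftext|>` and `<|unk|>` as assumed token names following GPT-2 conventions:

```python
# Append special tokens to the vocabulary (assumed cell contents,
# not shown in this diff).
all_tokens = sorted(set(preprocessed))
all_tokens.extend(["<|endoftext|>", "<|unk|>"])
vocab = {token: integer for integer, token in enumerate(all_tokens)}
print(len(vocab))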
@@ -678,7 +685,7 @@
 "id": "5c4ba34b-170f-4e71-939b-77aabb776f14",
 "metadata": {},
 "source": [
-"## 2.4 BytePair encoding"
+"## 2.5 BytePair encoding"
 ]
 },
 {
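Again only the heading number changes. For orientation, byte-pair encoding in this context is commonly driven by the `tiktoken` library's GPT-2 encoding; which implementation the notebook uses is not visible in this diff, so treat the following as an assumption:

```python
import tiktoken

# GPT-2's BPE tokenizer; <|endoftext|> must be explicitly allowed.
tokenizer = tiktoken.get_encoding("gpt2")
ids = tokenizer.encode("Hello, do you like tea? <|endoftext|>",
                       allowed_special={"<|endoftext|>"})
print(ids)
print(tokenizer.decode(ids))
```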
@@ -782,7 +789,7 @@
 "id": "abbd7c0d-70f8-4386-a114-907e96c950b0",
 "metadata": {},
 "source": [
-"## 2.5 Data sampling with a sliding window"
+"## 2.6 Data sampling with a sliding window"
 ]
 },
 {
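Heading renumber only. A sliding window over the encoded text pairs each context with the next token as the prediction target; a sketch assuming a BPE-encoded `raw_text` and an illustrative context size of 4:

```python
import tiktoken

tokenizer = tiktoken.get_encoding("gpt2")
enc_text = tokenizer.encode(raw_text)

context_size = 4  # assumed window length for illustration
for i in range(1, context_size + 1):
    context = enc_text[:i]  # tokens seen so far
    desired = enc_text[i]   # next token to predict
    print(tokenizer.decode(context), "---->", tokenizer.decode([desired]))
```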
@@ -1119,7 +1126,7 @@
 "id": "2cd2fcda-2fda-4aa8-8bc8-de1e496f9db1",
 "metadata": {},
 "source": [
-"## 2.6 Creating token embeddings"
+"## 2.7 Creating token embeddings"
 ]
 },
 {
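Another renumbered heading. Token embeddings map each token ID to a dense vector via a learned lookup table; a minimal sketch, where the vocabulary size matches GPT-2's BPE vocabulary and the output dimension of 256 is an assumption:

```python
import torch

vocab_size = 50257  # GPT-2 BPE vocabulary size
output_dim = 256    # assumed embedding width for illustration

token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)
token_ids = torch.tensor([2, 3, 5, 1])
print(token_embedding_layer(token_ids).shape)  # torch.Size([4, 256])
```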
@@ -1304,7 +1311,7 @@
 "id": "c393d270-b950-4bc8-99ea-97d74f2ea0f6",
 "metadata": {},
 "source": [
-"## 2.7 Encoding word positions"
+"## 2.8 Encoding word positions"
 ]
 },
 {
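The last renumbered heading. Absolute positional embeddings give the model word-order information by adding a learned per-position vector to each token embedding; a sketch with an assumed context length of 4 and the same assumed dimensions as above:

```python
import torch

context_length = 4
output_dim = 256
token_embedding_layer = torch.nn.Embedding(50257, output_dim)
pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)

token_embeddings = token_embedding_layer(torch.tensor([2, 3, 5, 1]))
pos_embeddings = pos_embedding_layer(torch.arange(context_length))

# Element-wise addition of position information to each token vector.
input_embeddings = token_embeddings + pos_embeddings
print(input_embeddings.shape)  # torch.Size([4, 256])
```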
@@ -1458,7 +1465,7 @@
 "name": "python",
 "nbconvert_exporter": "python",
 "pygments_lexer": "ipython3",
-"version": "3.11.4"
+"version": "3.10.6"
 }
 },
 "nbformat": 4,