mirror of
https://github.com/rasbt/LLMs-from-scratch.git
synced 2025-10-30 01:10:33 +00:00
fixing the regular expression used in the SimpleTokenizer (#259)
* fixing the regular expression used in the SimpleTokenizer class and a typo in the 2.7 Creating token embedding introduction section * rerun --------- Co-authored-by: rasbt <mail@sebastianraschka.com>
This commit is contained in:
parent
3e47407646
commit
46f4d9e575
@ -560,7 +560,8 @@
|
||||
" self.int_to_str = {i:s for s,i in vocab.items()}\n",
|
||||
" \n",
|
||||
" def encode(self, text):\n",
|
||||
" preprocessed = re.split(r'([,.?_!\"()\\']|--|\\s)', text)\n",
|
||||
" preprocessed = re.split(r'([,.:;?_!\"()\\']|--|\\s)', text)\n",
|
||||
" \n",
|
||||
" preprocessed = [\n",
|
||||
" item.strip() for item in preprocessed if item.strip()\n",
|
||||
" ]\n",
|
||||
@ -754,8 +755,8 @@
|
||||
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
||||
"\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)",
|
||||
"Cell \u001b[0;32mIn[17], line 5\u001b[0m\n\u001b[1;32m 1\u001b[0m tokenizer \u001b[38;5;241m=\u001b[39m SimpleTokenizerV1(vocab)\n\u001b[1;32m 3\u001b[0m text \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mHello, do you like tea. Is this-- a test?\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m----> 5\u001b[0m tokenizer\u001b[38;5;241m.\u001b[39mencode(text)\n",
|
||||
"Cell \u001b[0;32mIn[13], line 11\u001b[0m, in \u001b[0;36mSimpleTokenizerV1.encode\u001b[0;34m(self, text)\u001b[0m\n\u001b[1;32m 7\u001b[0m preprocessed \u001b[38;5;241m=\u001b[39m re\u001b[38;5;241m.\u001b[39msplit(\u001b[38;5;124mr\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m([,.?_!\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m()\u001b[39m\u001b[38;5;130;01m\\'\u001b[39;00m\u001b[38;5;124m]|--|\u001b[39m\u001b[38;5;124m\\\u001b[39m\u001b[38;5;124ms)\u001b[39m\u001b[38;5;124m'\u001b[39m, text)\n\u001b[1;32m 8\u001b[0m preprocessed \u001b[38;5;241m=\u001b[39m [\n\u001b[1;32m 9\u001b[0m item\u001b[38;5;241m.\u001b[39mstrip() \u001b[38;5;28;01mfor\u001b[39;00m item \u001b[38;5;129;01min\u001b[39;00m preprocessed \u001b[38;5;28;01mif\u001b[39;00m item\u001b[38;5;241m.\u001b[39mstrip()\n\u001b[1;32m 10\u001b[0m ]\n\u001b[0;32m---> 11\u001b[0m ids \u001b[38;5;241m=\u001b[39m [\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstr_to_int[s] \u001b[38;5;28;01mfor\u001b[39;00m s \u001b[38;5;129;01min\u001b[39;00m preprocessed]\n\u001b[1;32m 12\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m ids\n",
|
||||
"Cell \u001b[0;32mIn[13], line 11\u001b[0m, in \u001b[0;36m<listcomp>\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m 7\u001b[0m preprocessed \u001b[38;5;241m=\u001b[39m re\u001b[38;5;241m.\u001b[39msplit(\u001b[38;5;124mr\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m([,.?_!\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m()\u001b[39m\u001b[38;5;130;01m\\'\u001b[39;00m\u001b[38;5;124m]|--|\u001b[39m\u001b[38;5;124m\\\u001b[39m\u001b[38;5;124ms)\u001b[39m\u001b[38;5;124m'\u001b[39m, text)\n\u001b[1;32m 8\u001b[0m preprocessed \u001b[38;5;241m=\u001b[39m [\n\u001b[1;32m 9\u001b[0m item\u001b[38;5;241m.\u001b[39mstrip() \u001b[38;5;28;01mfor\u001b[39;00m item \u001b[38;5;129;01min\u001b[39;00m preprocessed \u001b[38;5;28;01mif\u001b[39;00m item\u001b[38;5;241m.\u001b[39mstrip()\n\u001b[1;32m 10\u001b[0m ]\n\u001b[0;32m---> 11\u001b[0m ids \u001b[38;5;241m=\u001b[39m [\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstr_to_int[s] \u001b[38;5;28;01mfor\u001b[39;00m s \u001b[38;5;129;01min\u001b[39;00m preprocessed]\n\u001b[1;32m 12\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m ids\n",
|
||||
"Cell \u001b[0;32mIn[13], line 12\u001b[0m, in \u001b[0;36mSimpleTokenizerV1.encode\u001b[0;34m(self, text)\u001b[0m\n\u001b[1;32m 7\u001b[0m preprocessed \u001b[38;5;241m=\u001b[39m re\u001b[38;5;241m.\u001b[39msplit(\u001b[38;5;124mr\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m([,.:;?_!\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m()\u001b[39m\u001b[38;5;130;01m\\'\u001b[39;00m\u001b[38;5;124m]|--|\u001b[39m\u001b[38;5;124m\\\u001b[39m\u001b[38;5;124ms)\u001b[39m\u001b[38;5;124m'\u001b[39m, text)\n\u001b[1;32m 9\u001b[0m preprocessed \u001b[38;5;241m=\u001b[39m [\n\u001b[1;32m 10\u001b[0m item\u001b[38;5;241m.\u001b[39mstrip() \u001b[38;5;28;01mfor\u001b[39;00m item \u001b[38;5;129;01min\u001b[39;00m preprocessed \u001b[38;5;28;01mif\u001b[39;00m item\u001b[38;5;241m.\u001b[39mstrip()\n\u001b[1;32m 11\u001b[0m ]\n\u001b[0;32m---> 12\u001b[0m ids \u001b[38;5;241m=\u001b[39m [\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstr_to_int[s] \u001b[38;5;28;01mfor\u001b[39;00m s \u001b[38;5;129;01min\u001b[39;00m preprocessed]\n\u001b[1;32m 13\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m ids\n",
|
||||
"Cell \u001b[0;32mIn[13], line 12\u001b[0m, in \u001b[0;36m<listcomp>\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m 7\u001b[0m preprocessed \u001b[38;5;241m=\u001b[39m re\u001b[38;5;241m.\u001b[39msplit(\u001b[38;5;124mr\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m([,.:;?_!\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m()\u001b[39m\u001b[38;5;130;01m\\'\u001b[39;00m\u001b[38;5;124m]|--|\u001b[39m\u001b[38;5;124m\\\u001b[39m\u001b[38;5;124ms)\u001b[39m\u001b[38;5;124m'\u001b[39m, text)\n\u001b[1;32m 9\u001b[0m preprocessed \u001b[38;5;241m=\u001b[39m [\n\u001b[1;32m 10\u001b[0m item\u001b[38;5;241m.\u001b[39mstrip() \u001b[38;5;28;01mfor\u001b[39;00m item \u001b[38;5;129;01min\u001b[39;00m preprocessed \u001b[38;5;28;01mif\u001b[39;00m item\u001b[38;5;241m.\u001b[39mstrip()\n\u001b[1;32m 11\u001b[0m ]\n\u001b[0;32m---> 12\u001b[0m ids \u001b[38;5;241m=\u001b[39m [\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstr_to_int[s] \u001b[38;5;28;01mfor\u001b[39;00m s \u001b[38;5;129;01min\u001b[39;00m preprocessed]\n\u001b[1;32m 13\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m ids\n",
|
||||
"\u001b[0;31mKeyError\u001b[0m: 'Hello'"
|
||||
]
|
||||
}
|
||||
@ -856,7 +857,7 @@
|
||||
" self.int_to_str = { i:s for s,i in vocab.items()}\n",
|
||||
" \n",
|
||||
" def encode(self, text):\n",
|
||||
" preprocessed = re.split(r'([,.?_!\"()\\']|--|\\s)', text)\n",
|
||||
" preprocessed = re.split(r'([,.:;?_!\"()\\']|--|\\s)', text)\n",
|
||||
" preprocessed = [item.strip() for item in preprocessed if item.strip()]\n",
|
||||
" preprocessed = [\n",
|
||||
" item if item in self.str_to_int \n",
|
||||
@ -884,7 +885,7 @@
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 22,
|
||||
"id": "effcef79-e0a5-4f4a-a43a-31dd94b9250a",
|
||||
"id": "daaf8cf8-d1a3-4640-a7c8-76fc398e7e0c",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
@ -1493,7 +1494,7 @@
|
||||
"id": "44e014ca-1fc5-4b90-b6fa-c2097bb92c0b",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"- Suppose we have the following four input examples with input ids 5, 1, 3, and 2 (after tokenization):"
|
||||
"- Suppose we have the following four input examples with input ids 2, 3, 5, and 1 (after tokenization):"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user