mirror of
https://github.com/rasbt/LLMs-from-scratch.git
synced 2025-12-25 06:02:07 +00:00
add colon and semicolon to tokenizer
This commit is contained in:
parent
5d02559993
commit
001507481e
@ -37,7 +37,7 @@
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"torch version: 2.1.0\n",
|
||||
"torch version: 2.2.1\n",
|
||||
"tiktoken version: 0.5.1\n"
|
||||
]
|
||||
}
|
||||
@ -273,7 +273,7 @@
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"id": "902f0d9c-9828-4c46-ba32-8fe810c3840a",
|
||||
"id": "ed3a9467-04b4-49d9-96c5-b8042bcf8374",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
@ -287,7 +287,7 @@
|
||||
"source": [
|
||||
"text = \"Hello, world. Is this-- a test?\"\n",
|
||||
"\n",
|
||||
"result = re.split(r'([,.?_!\"()\\']|--|\\s)', text)\n",
|
||||
"result = re.split(r'([,.:;?_!\"()\\']|--|\\s)', text)\n",
|
||||
"result = [item.strip() for item in result if item.strip()]\n",
|
||||
"print(result)"
|
||||
]
|
||||
@ -750,7 +750,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 21,
|
||||
"execution_count": 17,
|
||||
"id": "ce9df29c-6c5b-43f1-8c1a-c7f7b79db78f",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
@ -766,7 +766,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 22,
|
||||
"execution_count": 18,
|
||||
"id": "57c3143b-e860-4d3b-a22a-de22b547a6a9",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
@ -776,7 +776,7 @@
|
||||
"1161"
|
||||
]
|
||||
},
|
||||
"execution_count": 22,
|
||||
"execution_count": 18,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
@ -787,7 +787,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 23,
|
||||
"execution_count": 19,
|
||||
"id": "50e51bb1-ae05-4aa8-a9ff-455b65ed1959",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
@ -818,7 +818,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 24,
|
||||
"execution_count": 20,
|
||||
"id": "948861c5-3f30-4712-a234-725f20d26f68",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
@ -854,7 +854,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 25,
|
||||
"execution_count": 21,
|
||||
"id": "effcef79-e0a5-4f4a-a43a-31dd94b9250a",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
@ -879,7 +879,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 26,
|
||||
"execution_count": 22,
|
||||
"id": "ddfe7346-398d-4bf8-99f1-5b071244ce95",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
@ -904,7 +904,7 @@
|
||||
" 7]"
|
||||
]
|
||||
},
|
||||
"execution_count": 26,
|
||||
"execution_count": 22,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
@ -915,7 +915,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 27,
|
||||
"execution_count": 23,
|
||||
"id": "0c350ff6-2734-4e84-9ec7-d578baa4ae1b",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
@ -925,7 +925,7 @@
|
||||
"'<|unk|>, do you like tea? <|endoftext|> In the sunlit terraces of the <|unk|>.'"
|
||||
]
|
||||
},
|
||||
"execution_count": 27,
|
||||
"execution_count": 23,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
@ -1876,7 +1876,7 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.10.6"
|
||||
"version": "3.10.12"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user