Use more recent sentencepiece tokenizer API (#696)
commit 01be5a42e4
parent bcfdbd7008
@@ -83,9 +83,9 @@
     "name": "stdout",
     "output_type": "stream",
     "text": [
-     "huggingface_hub version: 0.24.7\n",
+     "huggingface_hub version: 0.33.0\n",
      "sentencepiece version: 0.2.0\n",
-     "torch version: 2.4.1+cu121\n"
+     "torch version: 2.6.0\n"
     ]
    }
   ],
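For reference, version lines like the ones in this output are typically produced by a small check along the following lines; this is a sketch rather than the notebook's exact cell, and the package list is simply taken from the output above.

    from importlib.metadata import version

    # Print the installed versions reflected in the updated cell output.
    for pkg in ("huggingface_hub", "sentencepiece", "torch"):
        print(f"{pkg} version: {version(pkg)}")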
@@ -1097,18 +1097,7 @@
    "id": "3357a230-b678-4691-a238-257ee4e80185",
    "outputId": "768ed6af-ce14-40bc-ca18-117b4b448269"
   },
-  "outputs": [
-   {
-    "name": "stdout",
-    "output_type": "stream",
-    "text": [
-     "The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.\n",
-     "Token is valid (permission: read).\n",
-     "Your token has been saved to /root/.cache/huggingface/token\n",
-     "Login successful\n"
-    ]
-   }
-  ],
+  "outputs": [],
   "source": [
    "from huggingface_hub import login\n",
    "import json\n",
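The stdout cleared in this hunk comes from authenticating with the Hugging Face Hub. A minimal sketch of that step with the current huggingface_hub API could look as follows; the config file name and JSON key are assumptions for illustration (the cell's source only shows the login and json imports).

    import json
    from huggingface_hub import login

    # Read a Hugging Face access token from a local JSON file; the file name
    # "config.json" and the key "HF_ACCESS_TOKEN" are placeholder assumptions.
    with open("config.json", "r") as f:
        access_token = json.load(f)["HF_ACCESS_TOKEN"]

    login(token=access_token)  # a token with "read" permission is sufficient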
@@ -1155,34 +1144,7 @@
    "id": "69714ea8-b9b8-4687-8392-f3abb8f93a32",
    "outputId": "c230fec9-5c71-4a41-90ab-8a34d114ea01"
   },
-  "outputs": [
-   {
-    "name": "stderr",
-    "output_type": "stream",
-    "text": [
-     "/usr/local/lib/python3.10/dist-packages/huggingface_hub/utils/_token.py:89: UserWarning: \n",
-     "The secret `HF_TOKEN` does not exist in your Colab secrets.\n",
-     "To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.\n",
-     "You will be able to reuse this secret in all of your notebooks.\n",
-     "Please note that authentication is recommended but still optional to access public models or datasets.\n",
-     "  warnings.warn(\n"
-    ]
-   },
-   {
-    "data": {
-     "application/vnd.jupyter.widget-view+json": {
-      "model_id": "e6c75a6aa7b942fe84160e286e3acb3d",
-      "version_major": 2,
-      "version_minor": 0
-     },
-     "text/plain": [
-      "tokenizer.model: 0%| | 0.00/500k [00:00<?, ?B/s]"
-     ]
-    },
-    "metadata": {},
-    "output_type": "display_data"
-   }
-  ],
+  "outputs": [],
   "source": [
    "from huggingface_hub import hf_hub_download\n",
    "\n",
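The outputs cleared in this hunk (the HF_TOKEN warning and the tokenizer.model download progress bar) come from fetching the SentencePiece model file. A sketch of that download with hf_hub_download follows; the repo_id is a placeholder assumption, since the diff only shows the filename "tokenizer.model".

    from huggingface_hub import hf_hub_download

    # Download the SentencePiece tokenizer model from the Hub. The repo_id
    # below is a placeholder assumption; only the filename appears in the
    # diff above.
    tokenizer_file = hf_hub_download(
        repo_id="some-org/some-llama-model",
        filename="tokenizer.model",
        local_dir=".",
    )
    print(tokenizer_file)  # local path to the downloaded file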
@@ -1222,10 +1184,10 @@
    "        self.tokenizer = sp\n",
    "\n",
    "    def encode(self, text):\n",
-   "        return self.tokenizer.encode_as_ids(text)\n",
+   "        return self.tokenizer.encode(text, out_type=int)\n",
    "\n",
    "    def decode(self, ids):\n",
-   "        return self.tokenizer.decode_pieces(ids)\n",
+   "        return self.tokenizer.decode(ids)\n",
    "\n",
    "\n",
    "tokenizer = LlamaTokenizer(tokenizer_file)"
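The substantive change in this commit is the switch from the older encode_as_ids/decode_pieces calls to the unified encode/decode methods of recent sentencepiece releases. A standalone sketch of the newer API, assuming a local tokenizer.model file (the path is illustrative, not from the diff):

    import sentencepiece as spm

    # Load the SentencePiece model; the file path is an assumption for illustration.
    sp = spm.SentencePieceProcessor(model_file="tokenizer.model")

    text = "Every effort moves"

    # Unified API used after this commit:
    ids = sp.encode(text, out_type=int)  # token IDs as a list of ints
    decoded = sp.decode(ids)             # IDs back to a string

    print(ids)
    print(decoded)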
@@ -1258,7 +1220,7 @@
     "output_type": "stream",
     "text": [
      "Output text:\n",
-      " Every effort movesαllRadius deletingpretcc否']; future eer napulate lackус während inter DES издаSchéon로жа Bass differencespadxsnu ;; ctx始\n"
+      " Every effort movesαllRadius deletingpretcc否']; future eer napulate lackус während inter DES издаSchéonkkarto Оryptato#{ningproof eerbye\n"
     ]
    }
   ],