Use more recent sentencepiece tokenizer API (#696)

Sebastian Raschka 2025-06-22 13:52:30 -05:00 committed by GitHub
parent bcfdbd7008
commit 01be5a42e4


@@ -83,9 +83,9 @@
 "name": "stdout",
 "output_type": "stream",
 "text": [
-"huggingface_hub version: 0.24.7\n",
+"huggingface_hub version: 0.33.0\n",
 "sentencepiece version: 0.2.0\n",
-"torch version: 2.4.1+cu121\n"
+"torch version: 2.6.0\n"
 ]
 }
 ],
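For context, version strings like the ones above are typically printed by a small setup cell; a minimal sketch, assuming the notebook queries importlib.metadata and with the package list inferred from the output above:

from importlib.metadata import version

# Package list inferred from the printed output above (an assumption)
for pkg in ("huggingface_hub", "sentencepiece", "torch"):
    print(f"{pkg} version: {version(pkg)}")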
@@ -1097,18 +1097,7 @@
 "id": "3357a230-b678-4691-a238-257ee4e80185",
 "outputId": "768ed6af-ce14-40bc-ca18-117b4b448269"
 },
-"outputs": [
- {
-  "name": "stdout",
-  "output_type": "stream",
-  "text": [
-   "The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.\n",
-   "Token is valid (permission: read).\n",
-   "Your token has been saved to /root/.cache/huggingface/token\n",
-   "Login successful\n"
-  ]
- }
-],
+"outputs": [],
 "source": [
 "from huggingface_hub import login\n",
 "import json\n",
@@ -1155,34 +1144,7 @@
 "id": "69714ea8-b9b8-4687-8392-f3abb8f93a32",
 "outputId": "c230fec9-5c71-4a41-90ab-8a34d114ea01"
 },
-"outputs": [
- {
-  "name": "stderr",
-  "output_type": "stream",
-  "text": [
-   "/usr/local/lib/python3.10/dist-packages/huggingface_hub/utils/_token.py:89: UserWarning: \n",
-   "The secret `HF_TOKEN` does not exist in your Colab secrets.\n",
-   "To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.\n",
-   "You will be able to reuse this secret in all of your notebooks.\n",
-   "Please note that authentication is recommended but still optional to access public models or datasets.\n",
-   " warnings.warn(\n"
-  ]
- },
- {
-  "data": {
-   "application/vnd.jupyter.widget-view+json": {
-    "model_id": "e6c75a6aa7b942fe84160e286e3acb3d",
-    "version_major": 2,
-    "version_minor": 0
-   },
-   "text/plain": [
-    "tokenizer.model: 0%| | 0.00/500k [00:00<?, ?B/s]"
-   ]
-  },
-  "metadata": {},
-  "output_type": "display_data"
- }
-],
+"outputs": [],
 "source": [
 "from huggingface_hub import hf_hub_download\n",
 "\n",
@@ -1222,10 +1184,10 @@
 "        self.tokenizer = sp\n",
 "\n",
 "    def encode(self, text):\n",
-"        return self.tokenizer.encode_as_ids(text)\n",
+"        return self.tokenizer.encode(text, out_type=int)\n",
 "\n",
 "    def decode(self, ids):\n",
-"        return self.tokenizer.decode_pieces(ids)\n",
+"        return self.tokenizer.decode(ids)\n",
 "\n",
 "\n",
 "tokenizer = LlamaTokenizer(tokenizer_file)"
@@ -1258,7 +1220,7 @@
 "output_type": "stream",
 "text": [
 "Output text:\n",
-" Every effort movesαllRadius deletingpretcc否']; future eer napulate lackус während inter DES издаSchéon로жа Bass differencespadxsnu ;; ctx始\n"
+" Every effort movesαllRadius deletingpretcc否']; future eer napulate lackус während inter DES издаSchéonkkarto Оryptato#{ningproof eerbye\n"
 ]
 }
 ],