mirror of
https://github.com/rasbt/LLMs-from-scratch.git
synced 2025-07-03 07:04:25 +00:00
Use more recent sentencepiece tokenizer API (#696)
This commit is contained in:
parent
bcfdbd7008
commit
01be5a42e4
@ -83,9 +83,9 @@
|
|||||||
"name": "stdout",
|
"name": "stdout",
|
||||||
"output_type": "stream",
|
"output_type": "stream",
|
||||||
"text": [
|
"text": [
|
||||||
"huggingface_hub version: 0.24.7\n",
|
"huggingface_hub version: 0.33.0\n",
|
||||||
"sentencepiece version: 0.2.0\n",
|
"sentencepiece version: 0.2.0\n",
|
||||||
"torch version: 2.4.1+cu121\n"
|
"torch version: 2.6.0\n"
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
@ -1097,18 +1097,7 @@
|
|||||||
"id": "3357a230-b678-4691-a238-257ee4e80185",
|
"id": "3357a230-b678-4691-a238-257ee4e80185",
|
||||||
"outputId": "768ed6af-ce14-40bc-ca18-117b4b448269"
|
"outputId": "768ed6af-ce14-40bc-ca18-117b4b448269"
|
||||||
},
|
},
|
||||||
"outputs": [
|
"outputs": [],
|
||||||
{
|
|
||||||
"name": "stdout",
|
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.\n",
|
|
||||||
"Token is valid (permission: read).\n",
|
|
||||||
"Your token has been saved to /root/.cache/huggingface/token\n",
|
|
||||||
"Login successful\n"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
"source": [
|
||||||
"from huggingface_hub import login\n",
|
"from huggingface_hub import login\n",
|
||||||
"import json\n",
|
"import json\n",
|
||||||
@ -1155,34 +1144,7 @@
|
|||||||
"id": "69714ea8-b9b8-4687-8392-f3abb8f93a32",
|
"id": "69714ea8-b9b8-4687-8392-f3abb8f93a32",
|
||||||
"outputId": "c230fec9-5c71-4a41-90ab-8a34d114ea01"
|
"outputId": "c230fec9-5c71-4a41-90ab-8a34d114ea01"
|
||||||
},
|
},
|
||||||
"outputs": [
|
"outputs": [],
|
||||||
{
|
|
||||||
"name": "stderr",
|
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"/usr/local/lib/python3.10/dist-packages/huggingface_hub/utils/_token.py:89: UserWarning: \n",
|
|
||||||
"The secret `HF_TOKEN` does not exist in your Colab secrets.\n",
|
|
||||||
"To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.\n",
|
|
||||||
"You will be able to reuse this secret in all of your notebooks.\n",
|
|
||||||
"Please note that authentication is recommended but still optional to access public models or datasets.\n",
|
|
||||||
" warnings.warn(\n"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"data": {
|
|
||||||
"application/vnd.jupyter.widget-view+json": {
|
|
||||||
"model_id": "e6c75a6aa7b942fe84160e286e3acb3d",
|
|
||||||
"version_major": 2,
|
|
||||||
"version_minor": 0
|
|
||||||
},
|
|
||||||
"text/plain": [
|
|
||||||
"tokenizer.model: 0%| | 0.00/500k [00:00<?, ?B/s]"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"metadata": {},
|
|
||||||
"output_type": "display_data"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
"source": [
|
||||||
"from huggingface_hub import hf_hub_download\n",
|
"from huggingface_hub import hf_hub_download\n",
|
||||||
"\n",
|
"\n",
|
||||||
@ -1222,10 +1184,10 @@
|
|||||||
" self.tokenizer = sp\n",
|
" self.tokenizer = sp\n",
|
||||||
"\n",
|
"\n",
|
||||||
" def encode(self, text):\n",
|
" def encode(self, text):\n",
|
||||||
" return self.tokenizer.encode_as_ids(text)\n",
|
" return self.tokenizer.encode(text, out_type=int)\n",
|
||||||
"\n",
|
"\n",
|
||||||
" def decode(self, ids):\n",
|
" def decode(self, ids):\n",
|
||||||
" return self.tokenizer.decode_pieces(ids)\n",
|
" return self.tokenizer.decode(ids)\n",
|
||||||
"\n",
|
"\n",
|
||||||
"\n",
|
"\n",
|
||||||
"tokenizer = LlamaTokenizer(tokenizer_file)"
|
"tokenizer = LlamaTokenizer(tokenizer_file)"
|
||||||
@ -1258,7 +1220,7 @@
|
|||||||
"output_type": "stream",
|
"output_type": "stream",
|
||||||
"text": [
|
"text": [
|
||||||
"Output text:\n",
|
"Output text:\n",
|
||||||
" Every effort movesαllRadius deletingpretcc否']; future eer napulate lackус während inter DES издаSchéon로жа Bass differencespadxsnu ;; ctx始\n"
|
" Every effort movesαllRadius deletingpretcc否']; future eer napulate lackус während inter DES издаSchéonkkarto Оryptato#{ningproof eerbye\n"
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
|
Loading…
x
Reference in New Issue
Block a user