Use more recent sentencepiece tokenizer API (#696)

Sebastian Raschka 2025-06-22 13:52:30 -05:00 committed by GitHub
parent bcfdbd7008
commit 01be5a42e4


@@ -83,9 +83,9 @@
 "name": "stdout",
 "output_type": "stream",
 "text": [
-"huggingface_hub version: 0.24.7\n",
+"huggingface_hub version: 0.33.0\n",
 "sentencepiece version: 0.2.0\n",
-"torch version: 2.4.1+cu121\n"
+"torch version: 2.6.0\n"
 ]
 }
 ],
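For context, version strings like the ones above are typically printed by a small setup cell; a minimal sketch, assuming the notebook queries importlib.metadata and with the package list inferred from the output above:

from importlib.metadata import version

# Package list inferred from the printed output above (an assumption)
for pkg in ("huggingface_hub", "sentencepiece", "torch"):
    print(f"{pkg} version: {version(pkg)}")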
@@ -1097,18 +1097,7 @@
 "id": "3357a230-b678-4691-a238-257ee4e80185",
 "outputId": "768ed6af-ce14-40bc-ca18-117b4b448269"
 },
-"outputs": [
- {
-  "name": "stdout",
-  "output_type": "stream",
-  "text": [
-   "The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.\n",
-   "Token is valid (permission: read).\n",
-   "Your token has been saved to /root/.cache/huggingface/token\n",
-   "Login successful\n"
-  ]
- }
-],
+"outputs": [],
 "source": [
 "from huggingface_hub import login\n",
 "import json\n",
@@ -1155,34 +1144,7 @@
 "id": "69714ea8-b9b8-4687-8392-f3abb8f93a32",
 "outputId": "c230fec9-5c71-4a41-90ab-8a34d114ea01"
 },
-"outputs": [
- {
-  "name": "stderr",
-  "output_type": "stream",
-  "text": [
-   "/usr/local/lib/python3.10/dist-packages/huggingface_hub/utils/_token.py:89: UserWarning: \n",
-   "The secret `HF_TOKEN` does not exist in your Colab secrets.\n",
-   "To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.\n",
-   "You will be able to reuse this secret in all of your notebooks.\n",
-   "Please note that authentication is recommended but still optional to access public models or datasets.\n",
-   " warnings.warn(\n"
-  ]
- },
- {
-  "data": {
-   "application/vnd.jupyter.widget-view+json": {
-    "model_id": "e6c75a6aa7b942fe84160e286e3acb3d",
-    "version_major": 2,
-    "version_minor": 0
-   },
-   "text/plain": [
-    "tokenizer.model: 0%| | 0.00/500k [00:00<?, ?B/s]"
-   ]
-  },
-  "metadata": {},
-  "output_type": "display_data"
- }
-],
+"outputs": [],
 "source": [
 "from huggingface_hub import hf_hub_download\n",
 "\n",
@@ -1222,10 +1184,10 @@
 "        self.tokenizer = sp\n",
 "\n",
 "    def encode(self, text):\n",
-"        return self.tokenizer.encode_as_ids(text)\n",
+"        return self.tokenizer.encode(text, out_type=int)\n",
 "\n",
 "    def decode(self, ids):\n",
-"        return self.tokenizer.decode_pieces(ids)\n",
+"        return self.tokenizer.decode(ids)\n",
 "\n",
 "\n",
 "tokenizer = LlamaTokenizer(tokenizer_file)"
@@ -1258,7 +1220,7 @@
 "output_type": "stream",
 "text": [
 "Output text:\n",
-" Every effort movesαllRadius deletingpretcc否']; future eer napulate lackус während inter DES издаSchéon로жа Bass differencespadxsnu ;; ctx始\n"
+" Every effort movesαllRadius deletingpretcc否']; future eer napulate lackус während inter DES издаSchéonkkarto Оryptato#{ningproof eerbye\n"
 ]
 }
 ],