Mirror of https://github.com/rasbt/LLMs-from-scratch.git (synced 2025-10-31 09:50:23 +00:00)

Use more recent sentencepiece tokenizer API (#696)

parent bcfdbd7008
commit 01be5a42e4
@@ -83,9 +83,9 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "huggingface_hub version: 0.24.7\n",
+      "huggingface_hub version: 0.33.0\n",
       "sentencepiece version: 0.2.0\n",
-      "torch version: 2.4.1+cu121\n"
+      "torch version: 2.6.0\n"
      ]
     }
    ],
@@ -1097,18 +1097,7 @@
     "id": "3357a230-b678-4691-a238-257ee4e80185",
     "outputId": "768ed6af-ce14-40bc-ca18-117b4b448269"
    },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.\n",
-      "Token is valid (permission: read).\n",
-      "Your token has been saved to /root/.cache/huggingface/token\n",
-      "Login successful\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "from huggingface_hub import login\n",
     "import json\n",
@@ -1155,34 +1144,7 @@
     "id": "69714ea8-b9b8-4687-8392-f3abb8f93a32",
     "outputId": "c230fec9-5c71-4a41-90ab-8a34d114ea01"
    },
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/usr/local/lib/python3.10/dist-packages/huggingface_hub/utils/_token.py:89: UserWarning: \n",
-      "The secret `HF_TOKEN` does not exist in your Colab secrets.\n",
-      "To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.\n",
-      "You will be able to reuse this secret in all of your notebooks.\n",
-      "Please note that authentication is recommended but still optional to access public models or datasets.\n",
-      "  warnings.warn(\n"
-     ]
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "e6c75a6aa7b942fe84160e286e3acb3d",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    }
-   ],
+   "outputs": [],
    "source": [
     "from huggingface_hub import hf_hub_download\n",
     "\n",
@@ -1222,10 +1184,10 @@
     "        self.tokenizer = sp\n",
     "\n",
     "    def encode(self, text):\n",
-    "        return self.tokenizer.encode_as_ids(text)\n",
+    "        return self.tokenizer.encode(text, out_type=int)\n",
     "\n",
     "    def decode(self, ids):\n",
-    "        return self.tokenizer.decode_pieces(ids)\n",
+    "        return self.tokenizer.decode(ids)\n",
     "\n",
     "\n",
     "tokenizer = LlamaTokenizer(tokenizer_file)"
@@ -1258,7 +1220,7 @@
      "output_type": "stream",
      "text": [
       "Output text:\n",
-      " Every effort movesαllRadius deletingpretcc否']; future eer napulate lackус während inter DES издаSchéon로жа Bass differencespadxsnu ;; ctx始\n"
+      " Every effort movesαllRadius deletingpretcc否']; future eer napulate lackус während inter DES издаSchéonkkarto Оryptato#{ningproof eerbye\n"
      ]
     }
    ],