Mirror of https://github.com/rasbt/LLMs-from-scratch.git (synced 2025-10-31 09:50:23 +00:00)

Use more recent sentencepiece tokenizer API (#696)

parent bcfdbd7008
commit 01be5a42e4
@@ -83,9 +83,9 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "huggingface_hub version: 0.24.7\n",
+      "huggingface_hub version: 0.33.0\n",
       "sentencepiece version: 0.2.0\n",
-      "torch version: 2.4.1+cu121\n"
+      "torch version: 2.6.0\n"
      ]
     }
    ],
@@ -1097,18 +1097,7 @@
     "id": "3357a230-b678-4691-a238-257ee4e80185",
     "outputId": "768ed6af-ce14-40bc-ca18-117b4b448269"
    },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.\n",
-      "Token is valid (permission: read).\n",
-      "Your token has been saved to /root/.cache/huggingface/token\n",
-      "Login successful\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "from huggingface_hub import login\n",
     "import json\n",
@@ -1155,34 +1144,7 @@
     "id": "69714ea8-b9b8-4687-8392-f3abb8f93a32",
     "outputId": "c230fec9-5c71-4a41-90ab-8a34d114ea01"
    },
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/usr/local/lib/python3.10/dist-packages/huggingface_hub/utils/_token.py:89: UserWarning: \n",
-      "The secret `HF_TOKEN` does not exist in your Colab secrets.\n",
-      "To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.\n",
-      "You will be able to reuse this secret in all of your notebooks.\n",
-      "Please note that authentication is recommended but still optional to access public models or datasets.\n",
-      "  warnings.warn(\n"
-     ]
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "e6c75a6aa7b942fe84160e286e3acb3d",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    }
-   ],
+   "outputs": [],
    "source": [
     "from huggingface_hub import hf_hub_download\n",
     "\n",
@@ -1222,10 +1184,10 @@
     "        self.tokenizer = sp\n",
     "\n",
     "    def encode(self, text):\n",
-    "        return self.tokenizer.encode_as_ids(text)\n",
+    "        return self.tokenizer.encode(text, out_type=int)\n",
     "\n",
     "    def decode(self, ids):\n",
-    "        return self.tokenizer.decode_pieces(ids)\n",
+    "        return self.tokenizer.decode(ids)\n",
     "\n",
     "\n",
     "tokenizer = LlamaTokenizer(tokenizer_file)"
@@ -1258,7 +1220,7 @@
      "output_type": "stream",
      "text": [
       "Output text:\n",
-      " Every effort movesαllRadius deletingpretcc否']; future eer napulate lackус während inter DES издаSchéon로жа Bass differencespadxsnu ;; ctx始\n"
+      " Every effort movesαllRadius deletingpretcc否']; future eer napulate lackус während inter DES издаSchéonkkarto Оryptato#{ningproof eerbye\n"
      ]
     }
    ],