Mirror of https://github.com/rasbt/LLMs-from-scratch.git (synced 2025-08-28 18:40:01 +00:00)
BPE: fixed typo (#492)
* fixed typo
* use rel path if exists
* mod gitignore and use existing vocab files

Co-authored-by: rasbt <mail@sebastianraschka.com>
This commit is contained in:
parent b17d097742
commit 3f9facbc55
.gitignore (vendored) | 7 +++++++
@@ -101,6 +101,13 @@ ch07/02_dataset-utilities/instruction-examples-modified.json
 ch07/04_preference-tuning-with-dpo/gpt2-medium355M-sft.pth
 ch07/04_preference-tuning-with-dpo/loss-plot.pdf
 
+# Tokenizer files
+ch02/05_bpe-from-scratch/bpe_merges.txt
+ch02/05_bpe-from-scratch/encoder.json
+ch02/05_bpe-from-scratch/vocab.bpe
+ch02/05_bpe-from-scratch/vocab.json
+
+
 # Other
 ch0?/0?_user_interface/.chainlit/
 ch0?/0?_user_interface/chainlit.md
ch02/05_bpe-from-scratch/bpe-from-scratch.ipynb
@@ -722,14 +722,14 @@
     "import os\n",
     "import urllib.request\n",
     "\n",
-    "if not os.path.exists(\"the-verdict.txt\"):\n",
+    "if not os.path.exists(\"../01_main-chapter-code/the-verdict.txt\"):\n",
     "    url = (\"https://raw.githubusercontent.com/rasbt/\"\n",
     "           \"LLMs-from-scratch/main/ch02/01_main-chapter-code/\"\n",
     "           \"the-verdict.txt\")\n",
-    "    file_path = \"the-verdict.txt\"\n",
+    "    file_path = \"../01_main-chapter-code/the-verdict.txt\"\n",
     "    urllib.request.urlretrieve(url, file_path)\n",
     "\n",
-    "with open(\"the-verdict.txt\", \"r\", encoding=\"utf-8\") as f:\n",
+    "with open(\"../01_main-chapter-code/the-verdict.txt\", \"r\", encoding=\"utf-8\") as f: # added ../01_main-chapter-code/\n",
     "    text = f.read()"
    ]
   },
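For readability, the JSON-escaped cell above consolidates to the following standalone Python. It reuses the existing copy of the-verdict.txt from the main chapter folder instead of downloading a duplicate; the relative path assumes the notebook runs from ch02/05_bpe-from-scratch/:

import os
import urllib.request

# Reuse the file from the main chapter folder; download it there only if absent.
file_path = "../01_main-chapter-code/the-verdict.txt"
if not os.path.exists(file_path):
    url = ("https://raw.githubusercontent.com/rasbt/"
           "LLMs-from-scratch/main/ch02/01_main-chapter-code/"
           "the-verdict.txt")
    urllib.request.urlretrieve(url, file_path)

with open(file_path, "r", encoding="utf-8") as f:
    text = f.read()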
@@ -876,7 +876,7 @@
    "id": "252693ee-e806-4dac-ab76-2c69086360f4",
    "metadata": {},
    "source": [
-    "- Note that the vocabulary itself is used in the `decoder()` method, which allows us to map the token IDs back into text:"
+    "- Note that the vocabulary itself is used in the `decode()` method, which allows us to map the token IDs back into text:"
    ]
   },
   {
@@ -1099,24 +1099,34 @@
     "import os\n",
     "import urllib.request\n",
     "\n",
-    "def download_file_if_absent(url, filename):\n",
-    "    if not os.path.exists(filename):\n",
-    "        try:\n",
-    "            with urllib.request.urlopen(url) as response, open(filename, 'wb') as out_file:\n",
-    "                out_file.write(response.read())\n",
-    "            print(f\"Downloaded {filename}\")\n",
-    "        except Exception as e:\n",
-    "            print(f\"Failed to download {filename}. Error: {e}\")\n",
-    "    else:\n",
-    "        print(f\"{filename} already exists\")\n",
+    "def download_file_if_absent(url, filename, search_dirs):\n",
+    "    for directory in search_dirs:\n",
+    "        file_path = os.path.join(directory, filename)\n",
+    "        if os.path.exists(file_path):\n",
+    "            print(f\"{filename} already exists in {file_path}\")\n",
+    "            return file_path\n",
+    "\n",
+    "    target_path = os.path.join(search_dirs[0], filename)\n",
+    "    try:\n",
+    "        with urllib.request.urlopen(url) as response, open(target_path, \"wb\") as out_file:\n",
+    "            out_file.write(response.read())\n",
+    "        print(f\"Downloaded {filename} to {target_path}\")\n",
+    "    except Exception as e:\n",
+    "        print(f\"Failed to download {filename}. Error: {e}\")\n",
+    "    return target_path\n",
     "\n",
+    "# Define the directories to search and the files to download\n",
+    "search_directories = [\".\", \"../02_bonus_bytepair-encoder/gpt2_model/\"]\n",
+    "\n",
     "files_to_download = {\n",
     "    \"https://openaipublic.blob.core.windows.net/gpt-2/models/124M/vocab.bpe\": \"vocab.bpe\",\n",
     "    \"https://openaipublic.blob.core.windows.net/gpt-2/models/124M/encoder.json\": \"encoder.json\"\n",
     "}\n",
     "\n",
+    "# Ensure directories exist and download files if needed\n",
+    "paths = {}\n",
     "for url, filename in files_to_download.items():\n",
-    "    download_file_if_absent(url, filename)"
+    "    paths[filename] = download_file_if_absent(url, filename, search_directories)"
    ]
   },
   {
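Unescaped, the new download helper from the hunk above reads as follows: it first searches the given directories for an existing copy (here the bonus byte-pair-encoder folder) and only downloads into the first directory when no copy is found:

import os
import urllib.request

def download_file_if_absent(url, filename, search_dirs):
    # Return the path of an existing copy, if one is found in any search dir.
    for directory in search_dirs:
        file_path = os.path.join(directory, filename)
        if os.path.exists(file_path):
            print(f"{filename} already exists in {file_path}")
            return file_path

    # No copy found: download into the first (preferred) directory.
    target_path = os.path.join(search_dirs[0], filename)
    try:
        with urllib.request.urlopen(url) as response, open(target_path, "wb") as out_file:
            out_file.write(response.read())
        print(f"Downloaded {filename} to {target_path}")
    except Exception as e:
        print(f"Failed to download {filename}. Error: {e}")
    return target_path

search_directories = [".", "../02_bonus_bytepair-encoder/gpt2_model/"]
files_to_download = {
    "https://openaipublic.blob.core.windows.net/gpt-2/models/124M/vocab.bpe": "vocab.bpe",
    "https://openaipublic.blob.core.windows.net/gpt-2/models/124M/encoder.json": "encoder.json"
}

paths = {}
for url, filename in files_to_download.items():
    paths[filename] = download_file_if_absent(url, filename, search_directories)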
@@ -1136,7 +1146,7 @@
    "source": [
     "tokenizer_gpt2 = BPETokenizerSimple()\n",
     "tokenizer_gpt2.load_vocab_and_merges_from_openai(\n",
-    "    vocab_path=\"encoder.json\", bpe_merges_path=\"vocab.bpe\"\n",
+    "    vocab_path=paths[\"encoder.json\"], bpe_merges_path=paths[\"vocab.bpe\"]\n",
     ")"
    ]
   },
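With the paths dict in place, the loader picks up whichever copy was found or downloaded, so the cell works both from a fresh checkout and when the bonus folder already holds the vocab files. A short usage sketch; BPETokenizerSimple and load_vocab_and_merges_from_openai() come from the notebook itself, while the encode() call is an assumption based on the decode() method referenced above:

tokenizer_gpt2 = BPETokenizerSimple()
tokenizer_gpt2.load_vocab_and_merges_from_openai(
    vocab_path=paths["encoder.json"], bpe_merges_path=paths["vocab.bpe"]
)

# Round trip: encode a sample string, then map the token IDs back to text.
ids = tokenizer_gpt2.encode("Hello, world!")  # encode() assumed per the notebook's API
print(tokenizer_gpt2.decode(ids))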