mirror of
https://github.com/rasbt/LLMs-from-scratch.git
synced 2025-10-24 14:29:19 +00:00
BPE: fixed typo (#492)
* fixed typo
* use rel path if exists
* mod gitignore and use existing vocab files

---------

Co-authored-by: rasbt <mail@sebastianraschka.com>
This commit is contained in:
parent
b17d097742
commit
3f9facbc55
7
.gitignore
vendored
7
.gitignore
vendored
@ -101,6 +101,13 @@ ch07/02_dataset-utilities/instruction-examples-modified.json
|
|||||||
ch07/04_preference-tuning-with-dpo/gpt2-medium355M-sft.pth
|
ch07/04_preference-tuning-with-dpo/gpt2-medium355M-sft.pth
|
||||||
ch07/04_preference-tuning-with-dpo/loss-plot.pdf
|
ch07/04_preference-tuning-with-dpo/loss-plot.pdf
|
||||||
|
|
||||||
|
# Tokenizer files
|
||||||
|
ch02/05_bpe-from-scratch/bpe_merges.txt
|
||||||
|
ch02/05_bpe-from-scratch/encoder.json
|
||||||
|
ch02/05_bpe-from-scratch/vocab.bpe
|
||||||
|
ch02/05_bpe-from-scratch/vocab.json
|
||||||
|
|
||||||
|
|
||||||
# Other
|
# Other
|
||||||
ch0?/0?_user_interface/.chainlit/
|
ch0?/0?_user_interface/.chainlit/
|
||||||
ch0?/0?_user_interface/chainlit.md
|
ch0?/0?_user_interface/chainlit.md
|
||||||
|
@ -722,14 +722,14 @@
|
|||||||
"import os\n",
|
"import os\n",
|
||||||
"import urllib.request\n",
|
"import urllib.request\n",
|
||||||
"\n",
|
"\n",
|
||||||
"if not os.path.exists(\"the-verdict.txt\"):\n",
|
"if not os.path.exists(\"../01_main-chapter-code/the-verdict.txt\"):\n",
|
||||||
" url = (\"https://raw.githubusercontent.com/rasbt/\"\n",
|
" url = (\"https://raw.githubusercontent.com/rasbt/\"\n",
|
||||||
" \"LLMs-from-scratch/main/ch02/01_main-chapter-code/\"\n",
|
" \"LLMs-from-scratch/main/ch02/01_main-chapter-code/\"\n",
|
||||||
" \"the-verdict.txt\")\n",
|
" \"the-verdict.txt\")\n",
|
||||||
" file_path = \"the-verdict.txt\"\n",
|
" file_path = \"../01_main-chapter-code/the-verdict.txt\"\n",
|
||||||
" urllib.request.urlretrieve(url, file_path)\n",
|
" urllib.request.urlretrieve(url, file_path)\n",
|
||||||
"\n",
|
"\n",
|
||||||
"with open(\"the-verdict.txt\", \"r\", encoding=\"utf-8\") as f:\n",
|
"with open(\"../01_main-chapter-code/the-verdict.txt\", \"r\", encoding=\"utf-8\") as f: # added ../01_main-chapter-code/\n",
|
||||||
" text = f.read()"
|
" text = f.read()"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
@ -876,7 +876,7 @@
|
|||||||
"id": "252693ee-e806-4dac-ab76-2c69086360f4",
|
"id": "252693ee-e806-4dac-ab76-2c69086360f4",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"source": [
|
"source": [
|
||||||
"- Note that the vocabulary itself is used in the `decoder()` method, which allows us to map the token IDs back into text:"
|
"- Note that the vocabulary itself is used in the `decode()` method, which allows us to map the token IDs back into text:"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -1099,24 +1099,34 @@
|
|||||||
"import os\n",
|
"import os\n",
|
||||||
"import urllib.request\n",
|
"import urllib.request\n",
|
||||||
"\n",
|
"\n",
|
||||||
"def download_file_if_absent(url, filename):\n",
|
"def download_file_if_absent(url, filename, search_dirs):\n",
|
||||||
" if not os.path.exists(filename):\n",
|
" for directory in search_dirs:\n",
|
||||||
" try:\n",
|
" file_path = os.path.join(directory, filename)\n",
|
||||||
" with urllib.request.urlopen(url) as response, open(filename, 'wb') as out_file:\n",
|
" if os.path.exists(file_path):\n",
|
||||||
" out_file.write(response.read())\n",
|
" print(f\"{filename} already exists in {file_path}\")\n",
|
||||||
" print(f\"Downloaded {filename}\")\n",
|
" return file_path\n",
|
||||||
" except Exception as e:\n",
|
"\n",
|
||||||
" print(f\"Failed to download {filename}. Error: {e}\")\n",
|
" target_path = os.path.join(search_dirs[0], filename)\n",
|
||||||
" else:\n",
|
" try:\n",
|
||||||
" print(f\"{filename} already exists\")\n",
|
" with urllib.request.urlopen(url) as response, open(target_path, \"wb\") as out_file:\n",
|
||||||
|
" out_file.write(response.read())\n",
|
||||||
|
" print(f\"Downloaded {filename} to {target_path}\")\n",
|
||||||
|
" except Exception as e:\n",
|
||||||
|
" print(f\"Failed to download {filename}. Error: {e}\")\n",
|
||||||
|
" return target_path\n",
|
||||||
|
"\n",
|
||||||
|
"# Define the directories to search and the files to download\n",
|
||||||
|
"search_directories = [\".\", \"../02_bonus_bytepair-encoder/gpt2_model/\"]\n",
|
||||||
"\n",
|
"\n",
|
||||||
"files_to_download = {\n",
|
"files_to_download = {\n",
|
||||||
" \"https://openaipublic.blob.core.windows.net/gpt-2/models/124M/vocab.bpe\": \"vocab.bpe\",\n",
|
" \"https://openaipublic.blob.core.windows.net/gpt-2/models/124M/vocab.bpe\": \"vocab.bpe\",\n",
|
||||||
" \"https://openaipublic.blob.core.windows.net/gpt-2/models/124M/encoder.json\": \"encoder.json\"\n",
|
" \"https://openaipublic.blob.core.windows.net/gpt-2/models/124M/encoder.json\": \"encoder.json\"\n",
|
||||||
"}\n",
|
"}\n",
|
||||||
"\n",
|
"\n",
|
||||||
|
"# Reuse files found in the search directories; download any that are missing\n",
|
||||||
|
"paths = {}\n",
|
||||||
"for url, filename in files_to_download.items():\n",
|
"for url, filename in files_to_download.items():\n",
|
||||||
" download_file_if_absent(url, filename)"
|
" paths[filename] = download_file_if_absent(url, filename, search_directories)"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -1136,7 +1146,7 @@
|
|||||||
"source": [
|
"source": [
|
||||||
"tokenizer_gpt2 = BPETokenizerSimple()\n",
|
"tokenizer_gpt2 = BPETokenizerSimple()\n",
|
||||||
"tokenizer_gpt2.load_vocab_and_merges_from_openai(\n",
|
"tokenizer_gpt2.load_vocab_and_merges_from_openai(\n",
|
||||||
" vocab_path=\"encoder.json\", bpe_merges_path=\"vocab.bpe\"\n",
|
" vocab_path=paths[\"encoder.json\"], bpe_merges_path=paths[\"vocab.bpe\"]\n",
|
||||||
")"
|
")"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
|
Loading…
x
Reference in New Issue
Block a user