diff --git a/appendix-E/01_main-chapter-code/appendix-E.ipynb b/appendix-E/01_main-chapter-code/appendix-E.ipynb
index 4d24891..70464c1 100644
--- a/appendix-E/01_main-chapter-code/appendix-E.ipynb
+++ b/appendix-E/01_main-chapter-code/appendix-E.ipynb
@@ -48,12 +48,12 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "matplotlib version: 3.7.2\n",
-      "numpy version: 1.25.2\n",
-      "tiktoken version: 0.5.1\n",
-      "torch version: 2.2.2\n",
-      "tensorflow version: 2.15.0\n",
-      "pandas version: 2.0.3\n"
+      "matplotlib version: 3.10.0\n",
+      "numpy version: 2.0.2\n",
+      "tiktoken version: 0.9.0\n",
+      "torch version: 2.6.0\n",
+      "tensorflow version: 2.18.0\n",
+      "pandas version: 2.2.3\n"
      ]
     }
    ],
@@ -190,6 +190,7 @@
     }
    ],
    "source": [
+    "import urllib\n",
     "from pathlib import Path\n",
     "import pandas as pd\n",
     "from previous_chapters import (\n",
@@ -204,7 +205,13 @@
     "extracted_path = \"sms_spam_collection\"\n",
     "data_file_path = Path(extracted_path) / \"SMSSpamCollection.tsv\"\n",
     "\n",
-    "download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path)\n",
+    "try:\n",
+    "    download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path)\n",
+    "except urllib.error.HTTPError:\n",
+    "    print(\"UCI Machine Learning Repository (https://archive.ics.uci.edu)\"\n",
+    "          \" temporarily unavailable. Using backup URL.\")\n",
+    "    url = \"https://f001.backblazeb2.com/file/LLMs-from-scratch/sms%2Bspam%2Bcollection.zip\"\n",
+    "    download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path)\n",
     "\n",
     "df = pd.read_csv(data_file_path, sep=\"\\t\", header=None, names=[\"Label\", \"Text\"])\n",
     "balanced_df = create_balanced_dataset(df)\n",
diff --git a/ch06/01_main-chapter-code/ch06.ipynb b/ch06/01_main-chapter-code/ch06.ipynb
index 04721f5..9e14ce6 100644
--- a/ch06/01_main-chapter-code/ch06.ipynb
+++ b/ch06/01_main-chapter-code/ch06.ipynb
@@ -50,7 +50,7 @@
      "text": [
       "matplotlib version: 3.10.0\n",
       "numpy version: 2.0.2\n",
-      "tiktoken version: 0.8.0\n",
+      "tiktoken version: 0.9.0\n",
       "torch version: 2.6.0\n",
       "tensorflow version: 2.18.0\n",
       "pandas version: 2.2.3\n"
      ]
     }
    ],
@@ -167,7 +167,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 2,
    "id": "def7c09b-af9c-4216-90ce-5e67aed1065c",
    "metadata": {
     "colab": {
@@ -181,7 +181,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "sms_spam_collection/SMSSpamCollection.tsv already exists. Skipping download and extraction.\n"
+      "File downloaded and saved as sms_spam_collection/SMSSpamCollection.tsv\n"
      ]
     }
    ],
@@ -215,7 +215,13 @@
     "    os.rename(original_file_path, data_file_path)\n",
     "    print(f\"File downloaded and saved as {data_file_path}\")\n",
     "\n",
-    "download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path)"
+    "try:\n",
+    "    download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path)\n",
+    "except urllib.error.HTTPError:\n",
+    "    print(\"UCI Machine Learning Repository (https://archive.ics.uci.edu)\"\n",
+    "          \" temporarily unavailable. Using backup URL.\")\n",
+    "    url = \"https://f001.backblazeb2.com/file/LLMs-from-scratch/sms%2Bspam%2Bcollection.zip\"\n",
+    "    download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path)"
    ]
   },
   {
diff --git a/ch06/01_main-chapter-code/gpt_class_finetune.py b/ch06/01_main-chapter-code/gpt_class_finetune.py
index b7a1b63..0f17d4b 100644
--- a/ch06/01_main-chapter-code/gpt_class_finetune.py
+++ b/ch06/01_main-chapter-code/gpt_class_finetune.py
@@ -276,7 +276,16 @@ if __name__ == "__main__":
     extracted_path = "sms_spam_collection"
     data_file_path = Path(extracted_path) / "SMSSpamCollection.tsv"
 
-    download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path, test_mode=args.test_mode)
+    try:
+        download_and_unzip_spam_data(
+            url, zip_path, extracted_path, data_file_path, test_mode=args.test_mode
+        )
+    except urllib.error.HTTPError:
+        backup_url = "https://f001.backblazeb2.com/file/LLMs-from-scratch/sms%2Bspam%2Bcollection.zip"
+        download_and_unzip_spam_data(
+            backup_url, zip_path, extracted_path, data_file_path, test_mode=args.test_mode
+        )
+
     df = pd.read_csv(data_file_path, sep="\t", header=None, names=["Label", "Text"])
     balanced_df = create_balanced_dataset(df)
     balanced_df["Label"] = balanced_df["Label"].map({"ham": 0, "spam": 1})
diff --git a/ch06/02_bonus_additional-experiments/additional_experiments.py b/ch06/02_bonus_additional-experiments/additional_experiments.py
index eb1ee25..28c3c09 100644
--- a/ch06/02_bonus_additional-experiments/additional_experiments.py
+++ b/ch06/02_bonus_additional-experiments/additional_experiments.py
@@ -603,7 +603,11 @@ if __name__ == "__main__":
     all_exist = all((base_path / file_name).exists() for file_name in file_names)
 
     if not all_exist:
-        download_and_unzip(url, zip_path, extract_to, new_file_path)
+        try:
+            download_and_unzip(url, zip_path, extract_to, new_file_path)
+        except urllib.error.HTTPError:
+            backup_url = "https://f001.backblazeb2.com/file/LLMs-from-scratch/sms%2Bspam%2Bcollection.zip"
+            download_and_unzip(backup_url, zip_path, extract_to, new_file_path)
         create_dataset_csvs(new_file_path)
 
     tokenizer = tiktoken.get_encoding("gpt2")
diff --git a/ch06/03_bonus_imdb-classification/train_bert_hf_spam.py b/ch06/03_bonus_imdb-classification/train_bert_hf_spam.py
index 6ba5c8b..603fb1c 100644
--- a/ch06/03_bonus_imdb-classification/train_bert_hf_spam.py
+++ b/ch06/03_bonus_imdb-classification/train_bert_hf_spam.py
@@ -410,7 +410,11 @@ if __name__ == "__main__":
     all_exist = all((base_path / file_name).exists() for file_name in file_names)
 
     if not all_exist:
-        download_and_unzip(url, zip_path, extract_to, new_file_path)
+        try:
+            download_and_unzip(url, zip_path, extract_to, new_file_path)
+        except urllib.error.HTTPError:
+            backup_url = "https://f001.backblazeb2.com/file/LLMs-from-scratch/sms%2Bspam%2Bcollection.zip"
+            download_and_unzip(backup_url, zip_path, extract_to, new_file_path)
         create_dataset_csvs(new_file_path)
 
     if args.use_attention_mask.lower() == "true":
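
All five hunks apply the same pattern: wrap the dataset download in a try/except on urllib.error.HTTPError and retry with the Backblaze mirror when the UCI host is unreachable. The snippet below is a minimal, self-contained sketch of that pattern; the simplified download_and_unzip_spam_data stand-in and the primary UCI URL are assumptions based on the chapter code, not part of this diff.

# Standalone sketch of the fallback-download pattern used in the hunks above.
# The helper is a simplified stand-in for the ch06 version; the primary URL is
# assumed from the chapter code and is not part of this diff.
import urllib.error
import urllib.request
import zipfile
from pathlib import Path

PRIMARY_URL = "https://archive.ics.uci.edu/static/public/228/sms+spam+collection.zip"
BACKUP_URL = "https://f001.backblazeb2.com/file/LLMs-from-scratch/sms%2Bspam%2Bcollection.zip"


def download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path):
    """Download the zip, extract it, and rename the data file to a .tsv path."""
    if data_file_path.exists():
        print(f"{data_file_path} already exists. Skipping download and extraction.")
        return
    with urllib.request.urlopen(url) as response, open(zip_path, "wb") as out_file:
        out_file.write(response.read())
    with zipfile.ZipFile(zip_path, "r") as zip_ref:
        zip_ref.extractall(extracted_path)
    Path(extracted_path, "SMSSpamCollection").rename(data_file_path)
    print(f"File downloaded and saved as {data_file_path}")


if __name__ == "__main__":
    zip_path = "sms_spam_collection.zip"
    extracted_path = "sms_spam_collection"
    data_file_path = Path(extracted_path) / "SMSSpamCollection.tsv"

    try:
        download_and_unzip_spam_data(PRIMARY_URL, zip_path, extracted_path, data_file_path)
    except urllib.error.HTTPError:
        # Primary host unreachable over HTTP; fall back to the mirror.
        print("Primary URL unavailable. Using backup URL.")
        download_and_unzip_spam_data(BACKUP_URL, zip_path, extracted_path, data_file_path)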