mirror of
				https://github.com/rasbt/LLMs-from-scratch.git
				synced 2025-10-30 17:29:59 +00:00 
			
		
		
		
	Add backup url for Spam Dataset (#543)
* Add backup url for Spam Dataset
* import urllib
* fix url
This commit is contained in:
		
							parent
							
								
									2254102270
								
							
						
					
					
						commit
						eb6787397c
					
				| @ -48,12 +48,12 @@ | ||||
|      "name": "stdout", | ||||
|      "output_type": "stream", | ||||
|      "text": [ | ||||
|       "matplotlib version: 3.7.2\n", | ||||
|       "numpy version: 1.25.2\n", | ||||
|       "tiktoken version: 0.5.1\n", | ||||
|       "torch version: 2.2.2\n", | ||||
|       "tensorflow version: 2.15.0\n", | ||||
|       "pandas version: 2.0.3\n" | ||||
|       "matplotlib version: 3.10.0\n", | ||||
|       "numpy version: 2.0.2\n", | ||||
|       "tiktoken version: 0.9.0\n", | ||||
|       "torch version: 2.6.0\n", | ||||
|       "tensorflow version: 2.18.0\n", | ||||
|       "pandas version: 2.2.3\n" | ||||
|      ] | ||||
|     } | ||||
|    ], | ||||
| @ -190,6 +190,7 @@ | ||||
|     } | ||||
|    ], | ||||
|    "source": [ | ||||
|     "import urllib\n", | ||||
|     "from pathlib import Path\n", | ||||
|     "import pandas as pd\n", | ||||
|     "from previous_chapters import (\n", | ||||
| @ -204,7 +205,13 @@ | ||||
|     "extracted_path = \"sms_spam_collection\"\n", | ||||
|     "data_file_path = Path(extracted_path) / \"SMSSpamCollection.tsv\"\n", | ||||
|     "\n", | ||||
|     "download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path)\n", | ||||
|     "try:\n", | ||||
|     "    download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path)\n", | ||||
|     "except urllib.error.HTTPError:\n", | ||||
|     "    print(\"UCI Machine Learning Repository (https://archive.ics.uci.edu)\"\n", | ||||
|     "          \" temporary unavailable. Using backup URL.\")\n", | ||||
|     "    url = \"https://f001.backblazeb2.com/file/LLMs-from-scratch/sms%2Bspam%2Bcollection.zip\"\n", | ||||
|     "    download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path)\n", | ||||
|     "\n", | ||||
|     "df = pd.read_csv(data_file_path, sep=\"\\t\", header=None, names=[\"Label\", \"Text\"])\n", | ||||
|     "balanced_df = create_balanced_dataset(df)\n", | ||||
|  | ||||
| @ -50,7 +50,7 @@ | ||||
|      "text": [ | ||||
|       "matplotlib version: 3.10.0\n", | ||||
|       "numpy version: 2.0.2\n", | ||||
|       "tiktoken version: 0.8.0\n", | ||||
|       "tiktoken version: 0.9.0\n", | ||||
|       "torch version: 2.6.0\n", | ||||
|       "tensorflow version: 2.18.0\n", | ||||
|       "pandas version: 2.2.3\n" | ||||
| @ -167,7 +167,7 @@ | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 3, | ||||
|    "execution_count": 2, | ||||
|    "id": "def7c09b-af9c-4216-90ce-5e67aed1065c", | ||||
|    "metadata": { | ||||
|     "colab": { | ||||
| @ -181,7 +181,7 @@ | ||||
|      "name": "stdout", | ||||
|      "output_type": "stream", | ||||
|      "text": [ | ||||
|       "sms_spam_collection/SMSSpamCollection.tsv already exists. Skipping download and extraction.\n" | ||||
|       "File downloaded and saved as sms_spam_collection/SMSSpamCollection.tsv\n" | ||||
|      ] | ||||
|     } | ||||
|    ], | ||||
| @ -215,7 +215,13 @@ | ||||
|     "    os.rename(original_file_path, data_file_path)\n", | ||||
|     "    print(f\"File downloaded and saved as {data_file_path}\")\n", | ||||
|     "\n", | ||||
|     "download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path)" | ||||
|     "try:\n", | ||||
|     "    download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path)\n", | ||||
|     "except urllib.error.HTTPError:\n", | ||||
|     "    print(\"UCI Machine Learning Repository (https://archive.ics.uci.edu)\"\n", | ||||
|     "          \" temporary unavailable. Using backup URL.\")\n", | ||||
|     "    url = \"https://f001.backblazeb2.com/file/LLMs-from-scratch/sms%2Bspam%2Bcollection.zip\"\n", | ||||
|     "    download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path) " | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|  | ||||
| @ -276,7 +276,16 @@ if __name__ == "__main__": | ||||
|     extracted_path = "sms_spam_collection" | ||||
|     data_file_path = Path(extracted_path) / "SMSSpamCollection.tsv" | ||||
| 
 | ||||
|     download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path, test_mode=args.test_mode) | ||||
|     try: | ||||
|         download_and_unzip_spam_data( | ||||
|             url, zip_path, extracted_path, data_file_path, test_mode=args.test_mode | ||||
|         ) | ||||
|     except urllib.error.HTTPError: | ||||
|         backup_url = "https://f001.backblazeb2.com/file/LLMs-from-scratch/sms%2Bspam%2Bcollection.zip" | ||||
|         download_and_unzip_spam_data( | ||||
|             backup_url, zip_path, extracted_path, data_file_path, test_mode=args.test_mode | ||||
|         ) | ||||
| 
 | ||||
|     df = pd.read_csv(data_file_path, sep="\t", header=None, names=["Label", "Text"]) | ||||
|     balanced_df = create_balanced_dataset(df) | ||||
|     balanced_df["Label"] = balanced_df["Label"].map({"ham": 0, "spam": 1}) | ||||
|  | ||||
| @ -603,7 +603,11 @@ if __name__ == "__main__": | ||||
|     all_exist = all((base_path / file_name).exists() for file_name in file_names) | ||||
| 
 | ||||
|     if not all_exist: | ||||
|         download_and_unzip(url, zip_path, extract_to, new_file_path) | ||||
|         try: | ||||
|             download_and_unzip(url, zip_path, extract_to, new_file_path) | ||||
|         except urllib.error.HTTPError: | ||||
|             backup_url = "https://f001.backblazeb2.com/file/LLMs-from-scratch/sms%2Bspam%2Bcollection.zip" | ||||
|             download_and_unzip(backup_url, zip_path, extract_to, new_file_path) | ||||
|         create_dataset_csvs(new_file_path) | ||||
| 
 | ||||
|     tokenizer = tiktoken.get_encoding("gpt2") | ||||
|  | ||||
| @ -410,7 +410,11 @@ if __name__ == "__main__": | ||||
|     all_exist = all((base_path / file_name).exists() for file_name in file_names) | ||||
| 
 | ||||
|     if not all_exist: | ||||
|         download_and_unzip(url, zip_path, extract_to, new_file_path) | ||||
|         try: | ||||
|             download_and_unzip(url, zip_path, extract_to, new_file_path) | ||||
|         except urllib.error.HTTPError: | ||||
|             backup_url = "https://f001.backblazeb2.com/file/LLMs-from-scratch/sms%2Bspam%2Bcollection.zip" | ||||
|             download_and_unzip(backup_url, zip_path, extract_to, new_file_path) | ||||
|         create_dataset_csvs(new_file_path) | ||||
| 
 | ||||
|     if args.use_attention_mask.lower() == "true": | ||||
|  | ||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user
	 Sebastian Raschka
						Sebastian Raschka