mirror of
https://github.com/rasbt/LLMs-from-scratch.git
synced 2025-09-01 12:27:59 +00:00
Add backup url for Spam Dataset (#543)
* Add backup url for Spam Dataset * import urllib * fix url
This commit is contained in:
parent
2254102270
commit
eb6787397c
@ -48,12 +48,12 @@
|
|||||||
"name": "stdout",
|
"name": "stdout",
|
||||||
"output_type": "stream",
|
"output_type": "stream",
|
||||||
"text": [
|
"text": [
|
||||||
"matplotlib version: 3.7.2\n",
|
"matplotlib version: 3.10.0\n",
|
||||||
"numpy version: 1.25.2\n",
|
"numpy version: 2.0.2\n",
|
||||||
"tiktoken version: 0.5.1\n",
|
"tiktoken version: 0.9.0\n",
|
||||||
"torch version: 2.2.2\n",
|
"torch version: 2.6.0\n",
|
||||||
"tensorflow version: 2.15.0\n",
|
"tensorflow version: 2.18.0\n",
|
||||||
"pandas version: 2.0.3\n"
|
"pandas version: 2.2.3\n"
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
@ -190,6 +190,7 @@
|
|||||||
}
|
}
|
||||||
],
|
],
|
||||||
"source": [
|
"source": [
|
||||||
|
"import urllib\n",
|
||||||
"from pathlib import Path\n",
|
"from pathlib import Path\n",
|
||||||
"import pandas as pd\n",
|
"import pandas as pd\n",
|
||||||
"from previous_chapters import (\n",
|
"from previous_chapters import (\n",
|
||||||
@ -204,7 +205,13 @@
|
|||||||
"extracted_path = \"sms_spam_collection\"\n",
|
"extracted_path = \"sms_spam_collection\"\n",
|
||||||
"data_file_path = Path(extracted_path) / \"SMSSpamCollection.tsv\"\n",
|
"data_file_path = Path(extracted_path) / \"SMSSpamCollection.tsv\"\n",
|
||||||
"\n",
|
"\n",
|
||||||
"download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path)\n",
|
"try:\n",
|
||||||
|
" download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path)\n",
|
||||||
|
"except urllib.error.HTTPError:\n",
|
||||||
|
" print(\"UCI Machine Learning Repository (https://archive.ics.uci.edu)\"\n",
|
||||||
|
" \" temporary unavailable. Using backup URL.\")\n",
|
||||||
|
" url = \"https://f001.backblazeb2.com/file/LLMs-from-scratch/sms%2Bspam%2Bcollection.zip\"\n",
|
||||||
|
" download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path)\n",
|
||||||
"\n",
|
"\n",
|
||||||
"df = pd.read_csv(data_file_path, sep=\"\\t\", header=None, names=[\"Label\", \"Text\"])\n",
|
"df = pd.read_csv(data_file_path, sep=\"\\t\", header=None, names=[\"Label\", \"Text\"])\n",
|
||||||
"balanced_df = create_balanced_dataset(df)\n",
|
"balanced_df = create_balanced_dataset(df)\n",
|
||||||
|
@ -50,7 +50,7 @@
|
|||||||
"text": [
|
"text": [
|
||||||
"matplotlib version: 3.10.0\n",
|
"matplotlib version: 3.10.0\n",
|
||||||
"numpy version: 2.0.2\n",
|
"numpy version: 2.0.2\n",
|
||||||
"tiktoken version: 0.8.0\n",
|
"tiktoken version: 0.9.0\n",
|
||||||
"torch version: 2.6.0\n",
|
"torch version: 2.6.0\n",
|
||||||
"tensorflow version: 2.18.0\n",
|
"tensorflow version: 2.18.0\n",
|
||||||
"pandas version: 2.2.3\n"
|
"pandas version: 2.2.3\n"
|
||||||
@ -167,7 +167,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 3,
|
"execution_count": 2,
|
||||||
"id": "def7c09b-af9c-4216-90ce-5e67aed1065c",
|
"id": "def7c09b-af9c-4216-90ce-5e67aed1065c",
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"colab": {
|
"colab": {
|
||||||
@ -181,7 +181,7 @@
|
|||||||
"name": "stdout",
|
"name": "stdout",
|
||||||
"output_type": "stream",
|
"output_type": "stream",
|
||||||
"text": [
|
"text": [
|
||||||
"sms_spam_collection/SMSSpamCollection.tsv already exists. Skipping download and extraction.\n"
|
"File downloaded and saved as sms_spam_collection/SMSSpamCollection.tsv\n"
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
@ -215,7 +215,13 @@
|
|||||||
" os.rename(original_file_path, data_file_path)\n",
|
" os.rename(original_file_path, data_file_path)\n",
|
||||||
" print(f\"File downloaded and saved as {data_file_path}\")\n",
|
" print(f\"File downloaded and saved as {data_file_path}\")\n",
|
||||||
"\n",
|
"\n",
|
||||||
"download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path)"
|
"try:\n",
|
||||||
|
" download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path)\n",
|
||||||
|
"except urllib.error.HTTPError:\n",
|
||||||
|
" print(\"UCI Machine Learning Repository (https://archive.ics.uci.edu)\"\n",
|
||||||
|
" \" temporary unavailable. Using backup URL.\")\n",
|
||||||
|
" url = \"https://f001.backblazeb2.com/file/LLMs-from-scratch/sms%2Bspam%2Bcollection.zip\"\n",
|
||||||
|
" download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path) "
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
@ -276,7 +276,16 @@ if __name__ == "__main__":
|
|||||||
extracted_path = "sms_spam_collection"
|
extracted_path = "sms_spam_collection"
|
||||||
data_file_path = Path(extracted_path) / "SMSSpamCollection.tsv"
|
data_file_path = Path(extracted_path) / "SMSSpamCollection.tsv"
|
||||||
|
|
||||||
download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path, test_mode=args.test_mode)
|
try:
|
||||||
|
download_and_unzip_spam_data(
|
||||||
|
url, zip_path, extracted_path, data_file_path, test_mode=args.test_mode
|
||||||
|
)
|
||||||
|
except urllib.error.HTTPError:
|
||||||
|
backup_url = "https://f001.backblazeb2.com/file/LLMs-from-scratch/sms%2Bspam%2Bcollection.zip"
|
||||||
|
download_and_unzip_spam_data(
|
||||||
|
backup_url, zip_path, extracted_path, data_file_path, test_mode=args.test_mode
|
||||||
|
)
|
||||||
|
|
||||||
df = pd.read_csv(data_file_path, sep="\t", header=None, names=["Label", "Text"])
|
df = pd.read_csv(data_file_path, sep="\t", header=None, names=["Label", "Text"])
|
||||||
balanced_df = create_balanced_dataset(df)
|
balanced_df = create_balanced_dataset(df)
|
||||||
balanced_df["Label"] = balanced_df["Label"].map({"ham": 0, "spam": 1})
|
balanced_df["Label"] = balanced_df["Label"].map({"ham": 0, "spam": 1})
|
||||||
|
@ -603,7 +603,11 @@ if __name__ == "__main__":
|
|||||||
all_exist = all((base_path / file_name).exists() for file_name in file_names)
|
all_exist = all((base_path / file_name).exists() for file_name in file_names)
|
||||||
|
|
||||||
if not all_exist:
|
if not all_exist:
|
||||||
download_and_unzip(url, zip_path, extract_to, new_file_path)
|
try:
|
||||||
|
download_and_unzip(url, zip_path, extract_to, new_file_path)
|
||||||
|
except urllib.error.HTTPError:
|
||||||
|
backup_url = "https://f001.backblazeb2.com/file/LLMs-from-scratch/sms%2Bspam%2Bcollection.zip"
|
||||||
|
download_and_unzip(backup_url, zip_path, extract_to, new_file_path)
|
||||||
create_dataset_csvs(new_file_path)
|
create_dataset_csvs(new_file_path)
|
||||||
|
|
||||||
tokenizer = tiktoken.get_encoding("gpt2")
|
tokenizer = tiktoken.get_encoding("gpt2")
|
||||||
|
@ -410,7 +410,11 @@ if __name__ == "__main__":
|
|||||||
all_exist = all((base_path / file_name).exists() for file_name in file_names)
|
all_exist = all((base_path / file_name).exists() for file_name in file_names)
|
||||||
|
|
||||||
if not all_exist:
|
if not all_exist:
|
||||||
download_and_unzip(url, zip_path, extract_to, new_file_path)
|
try:
|
||||||
|
download_and_unzip(url, zip_path, extract_to, new_file_path)
|
||||||
|
except urllib.error.HTTPError:
|
||||||
|
backup_url = "https://f001.backblazeb2.com/file/LLMs-from-scratch/sms%2Bspam%2Bcollection.zip"
|
||||||
|
download_and_unzip(backup_url, zip_path, extract_to, new_file_path)
|
||||||
create_dataset_csvs(new_file_path)
|
create_dataset_csvs(new_file_path)
|
||||||
|
|
||||||
if args.use_attention_mask.lower() == "true":
|
if args.use_attention_mask.lower() == "true":
|
||||||
|
Loading…
x
Reference in New Issue
Block a user