{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 75,
   "id": "b612c4c1-fa3c-47b9-a8ce-9e32f371e160",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "sms_spam_collection/SMSSpamCollection.tsv already exists. Skipping download and extraction.\n"
     ]
    }
   ],
   "source": [
    "import urllib.request\n",
    "import zipfile\n",
    "import os\n",
    "from pathlib import Path\n",
    "\n",
    "url = \"https://archive.ics.uci.edu/static/public/228/sms+spam+collection.zip\"\n",
    "zip_path = \"sms_spam_collection.zip\"\n",
    "extract_to = \"sms_spam_collection\"\n",
    "new_file_path = Path(extract_to) / \"SMSSpamCollection.tsv\"\n",
    "\n",
    "def download_and_unzip(url, zip_path, extract_to, new_file_path):\n",
    "    \"\"\"Download a zip archive, extract it, and rename the extracted data file.\n",
    "\n",
    "    Does nothing (besides printing a notice) when `new_file_path` already exists,\n",
    "    so re-running the cell does not repeat the download.\n",
    "\n",
    "    Args:\n",
    "        url: Address of the zip archive to fetch.\n",
    "        zip_path: Local path the downloaded archive is written to.\n",
    "        extract_to: Directory the archive is extracted into.\n",
    "        new_file_path: `pathlib.Path` of the final renamed data file; its\n",
    "            existence is what triggers the early return.\n",
    "    \"\"\"\n",
    "    # Check if the target file already exists\n",
    "    if new_file_path.exists():\n",
    "        print(f\"{new_file_path} already exists. Skipping download and extraction.\")\n",
    "        return\n",
    "\n",
    "    # Downloading the file (the whole response body is read into memory at once)\n",
    "    with urllib.request.urlopen(url) as response:\n",
    "        with open(zip_path, \"wb\") as out_file:\n",
    "            out_file.write(response.read())\n",
    "\n",
    "    # Unzipping the file\n",
    "    with zipfile.ZipFile(zip_path, 'r') as zip_ref:\n",
    "        zip_ref.extractall(extract_to)\n",
    "\n",
    "    # Renaming the file to indicate its format\n",
    "    original_file = Path(extract_to) / \"SMSSpamCollection\"\n",
    "    os.rename(original_file, new_file_path)\n",
    "    print(f\"File download and saved as {new_file_path}\")\n",
    "\n",
    "# Execute the function (the call must use the name defined above, otherwise a\n",
    "# fresh kernel fails with NameError)\n",
    "download_and_unzip(url, zip_path, extract_to, new_file_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 76,
   "id": "69f32433-e19c-4066-b806-8f30b408107f",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "
\n", " | Label | \n", "Text | \n", "
---|---|---|
0 | \n", "ham | \n", "Aight text me when you're back at mu and I'll ... | \n", "
1 | \n", "ham | \n", "Our Prashanthettan's mother passed away last n... | \n", "
2 | \n", "ham | \n", "No it will reach by 9 only. She telling she wi... | \n", "
3 | \n", "ham | \n", "Do you know when the result. | \n", "
4 | \n", "spam | \n", "Hi. Customer Loyalty Offer:The NEW Nokia6650 M... | \n", "
... | \n", "... | \n", "... | \n", "
5567 | \n", "ham | \n", "I accidentally brought em home in the box | \n", "
5568 | \n", "spam | \n", "Moby Pub Quiz.Win a £100 High Street prize if ... | \n", "
5569 | \n", "ham | \n", "Que pases un buen tiempo or something like that | \n", "
5570 | \n", "ham | \n", "Nowadays people are notixiquating the laxinorf... | \n", "
5571 | \n", "ham | \n", "Ard 4 lor... | \n", "
5572 rows × 2 columns
\n", "