519 lines
16 KiB
Plaintext
Raw Normal View History

{
"cells": [
{
"cell_type": "code",
"execution_count": 75,
"id": "b612c4c1-fa3c-47b9-a8ce-9e32f371e160",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"sms_spam_collection/SMSSpamCollection.tsv already exists. Skipping download and extraction.\n"
]
}
],
"source": [
"import os\n",
"import shutil\n",
"import urllib.request\n",
"import zipfile\n",
"from pathlib import Path\n",
"\n",
"# Remote archive plus the local paths used while fetching the dataset\n",
"url = \"https://archive.ics.uci.edu/static/public/228/sms+spam+collection.zip\"\n",
"zip_path = \"sms_spam_collection.zip\"\n",
"extract_to = \"sms_spam_collection\"\n",
"# Final location of the extracted data file, renamed with a .tsv suffix\n",
"new_file_path = Path(extract_to).joinpath(\"SMSSpamCollection.tsv\")\n",
"\n",
"def download_and_unzip(url, zip_path, extract_to, new_file_path):\n",
"    \"\"\"Download a zip archive, extract it, and rename the extracted data file.\n",
"\n",
"    Returns immediately, without downloading, if `new_file_path` already exists.\n",
"\n",
"    Args:\n",
"        url: Address of the zip archive to fetch.\n",
"        zip_path: Local path the downloaded archive is written to.\n",
"        extract_to: Directory the archive is extracted into.\n",
"        new_file_path: `pathlib.Path` that the extracted `SMSSpamCollection`\n",
"            file is renamed to.\n",
"    \"\"\"\n",
"    # Check if the target file already exists\n",
"    if new_file_path.exists():\n",
"        print(f\"{new_file_path} already exists. Skipping download and extraction.\")\n",
"        return\n",
"\n",
"    # Stream the download to disk instead of holding the whole archive in memory\n",
"    with urllib.request.urlopen(url) as response, open(zip_path, \"wb\") as out_file:\n",
"        shutil.copyfileobj(response, out_file)\n",
"\n",
"    # Unzipping the file\n",
"    with zipfile.ZipFile(zip_path, \"r\") as zip_ref:\n",
"        zip_ref.extractall(extract_to)\n",
"\n",
"    # Renaming the file to indicate its format\n",
"    original_file = Path(extract_to) / \"SMSSpamCollection\"\n",
"    original_file.rename(new_file_path)\n",
"    print(f\"File downloaded and saved as {new_file_path}\")\n",
"\n",
"# Fetch the dataset; the call must use the name the function is defined under\n",
"# in this cell (download_and_unzip), otherwise a fresh kernel raises NameError\n",
"download_and_unzip(url, zip_path, extract_to, new_file_path)"
]
},
{
"cell_type": "code",
"execution_count": 76,
"id": "69f32433-e19c-4066-b806-8f30b408107f",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Label</th>\n",
" <th>Text</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>ham</td>\n",
" <td>Aight text me when you're back at mu and I'll ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>ham</td>\n",
" <td>Our Prashanthettan's mother passed away last n...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>ham</td>\n",
" <td>No it will reach by 9 only. She telling she wi...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>ham</td>\n",
" <td>Do you know when the result.</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>spam</td>\n",
" <td>Hi. Customer Loyalty Offer:The NEW Nokia6650 M...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5567</th>\n",
" <td>ham</td>\n",
" <td>I accidentally brought em home in the box</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5568</th>\n",
" <td>spam</td>\n",
" <td>Moby Pub Quiz.Win a £100 High Street prize if ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5569</th>\n",
" <td>ham</td>\n",
" <td>Que pases un buen tiempo or something like that</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5570</th>\n",
" <td>ham</td>\n",
" <td>Nowadays people are notixiquating the laxinorf...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5571</th>\n",
" <td>ham</td>\n",
" <td>Ard 4 lor...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5572 rows × 2 columns</p>\n",
"</div>"
],
"text/plain": [
" Label Text\n",
"0 ham Aight text me when you're back at mu and I'll ...\n",
"1 ham Our Prashanthettan's mother passed away last n...\n",
"2 ham No it will reach by 9 only. She telling she wi...\n",
"3 ham Do you know when the result.\n",
"4 spam Hi. Customer Loyalty Offer:The NEW Nokia6650 M...\n",
"... ... ...\n",
"5567 ham I accidentally brought em home in the box\n",
"5568 spam Moby Pub Quiz.Win a £100 High Street prize if ...\n",
"5569 ham Que pases un buen tiempo or something like that\n",
"5570 ham Nowadays people are notixiquating the laxinorf...\n",
"5571 ham Ard 4 lor...\n",
"\n",
"[5572 rows x 2 columns]"
]
},
"execution_count": 76,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import pandas as pd\n",
"\n",
"# Load the tab-separated file (it has no header row) and shuffle the rows with a fixed seed\n",
"# NOTE(review): read_csv's default quote handling is in effect; an unbalanced double-quote\n",
"# character inside a message could merge lines - confirm the 5572 loaded rows match the\n",
"# dataset's documented size\n",
"df = (\n",
"    pd.read_csv(new_file_path, sep=\"\\t\", header=None, names=[\"Label\", \"Text\"])\n",
"    .sample(frac=1, random_state=123)\n",
"    .reset_index(drop=True)\n",
")\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 77,
"id": "4b7beeba-9f3a-45f0-b2dc-76bb155a8f0e",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Label\n",
"ham 4825\n",
"spam 747\n",
"Name: count, dtype: int64\n"
]
}
],
"source": [
"# Number of messages per class\n",
"label_counts = df[\"Label\"].value_counts()\n",
"print(label_counts)"
]
},
{
"cell_type": "code",
"execution_count": 78,
"id": "b3db862a-9e03-4715-babb-9b699e4f4a36",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Label\n",
"spam 747\n",
"ham 747\n",
"Name: count, dtype: int64\n"
]
}
],
"source": [
"# Count the instances of 'spam'\n",
"n_spam = df[df[\"Label\"] == \"spam\"].shape[0]\n",
"\n",
"# Randomly sample 'ham' instances to match the number of 'spam' instances\n",
"# (seeded so the same subset is drawn on every run)\n",
"ham_sampled = df[df[\"Label\"] == \"ham\"].sample(n_spam, random_state=123)\n",
"\n",
"# Combine the sampled 'ham' with all 'spam', then shuffle with a fixed seed\n",
"balanced_df = (\n",
"    pd.concat([ham_sampled, df[df[\"Label\"] == \"spam\"]])\n",
"    .sample(frac=1, random_state=123)\n",
"    .reset_index(drop=True)\n",
")\n",
"\n",
"# Now balanced_df is the balanced DataFrame\n",
"print(balanced_df[\"Label\"].value_counts())"
]
},
{
"cell_type": "code",
"execution_count": 79,
"id": "0af991e5-98ef-439a-a43d-63a581a2cc6d",
"metadata": {},
"outputs": [],
"source": [
"# Encode the labels as integers. Values that are not keys of the mapping (for\n",
"# example labels that were already encoded) pass through unchanged, so running\n",
"# this cell a second time does not turn the column into NaN.\n",
"label_map = {\"ham\": 0, \"spam\": 1}\n",
"df[\"Label\"] = df[\"Label\"].map(lambda label: label_map.get(label, label))"
]
},
{
"cell_type": "code",
"execution_count": 80,
"id": "2f5b00ef-e3ed-4819-b271-5f355848feb5",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training set:\n",
"Label\n",
"0 0.86612\n",
"1 0.13388\n",
"Name: proportion, dtype: float64\n",
"\n",
"Validation set:\n",
"Label\n",
"0 0.866906\n",
"1 0.133094\n",
"Name: proportion, dtype: float64\n",
"\n",
"Test set:\n",
"Label\n",
"0 0.864816\n",
"1 0.135184\n",
"Name: proportion, dtype: float64\n"
]
}
],
"source": [
"# Fractions of each class assigned to the training and validation sets\n",
"train_size = 0.7\n",
"validation_size = 0.1\n",
"# The remaining 0.2 of each class forms the test set\n",
"\n",
"# Split the data\n",
"def stratified_split(df, stratify_col, train_frac, validation_frac, random_state=123):\n",
"    \"\"\"Split a DataFrame into train/validation/test sets, class by class.\n",
"\n",
"    For every distinct value in `stratify_col`, the matching rows are sliced, in\n",
"    their existing order, into a leading `train_frac` share, a following\n",
"    `validation_frac` share, and the remainder (test). The per-class pieces are\n",
"    concatenated and each resulting set is shuffled.\n",
"\n",
"    Args:\n",
"        df: DataFrame to split.\n",
"        stratify_col: Name of the column whose values define the classes.\n",
"        train_frac: Fraction of each class placed in the training set.\n",
"        validation_frac: Fraction of each class placed in the validation set.\n",
"        random_state: Seed used when shuffling each set (default 123).\n",
"\n",
"    Returns:\n",
"        Tuple `(train, validation, test)` of DataFrames with reset indexes.\n",
"\n",
"    Raises:\n",
"        ValueError: If a fraction is negative or the two fractions sum to more than 1.\n",
"    \"\"\"\n",
"    # Small tolerance so that float round-off in the sum is not rejected\n",
"    if train_frac < 0 or validation_frac < 0 or train_frac + validation_frac > 1 + 1e-9:\n",
"        raise ValueError(\n",
"            \"train_frac and validation_frac must be non-negative and sum to at most 1\"\n",
"        )\n",
"\n",
"    # Collect the per-class slices and concatenate once per set, rather than\n",
"    # growing a DataFrame inside the loop\n",
"    train_parts, validation_parts, test_parts = [], [], []\n",
"\n",
"    # Stratify split by the unique values in the column\n",
"    for value in df[stratify_col].unique():\n",
"        # Filter the DataFrame for the class\n",
"        df_class = df[df[stratify_col] == value]\n",
"\n",
"        # Calculate class split sizes (int() truncates to whole rows)\n",
"        train_end = int(len(df_class) * train_frac)\n",
"        validation_end = train_end + int(len(df_class) * validation_frac)\n",
"\n",
"        # Slice the DataFrame to get the sets\n",
"        train_parts.append(df_class[:train_end])\n",
"        validation_parts.append(df_class[train_end:validation_end])\n",
"        test_parts.append(df_class[validation_end:])\n",
"\n",
"    def _combine_and_shuffle(parts):\n",
"        # With no classes there is nothing to concatenate; fall back to an empty frame\n",
"        combined = pd.concat(parts, axis=0) if parts else pd.DataFrame()\n",
"        return combined.sample(frac=1, random_state=random_state).reset_index(drop=True)\n",
"\n",
"    return (\n",
"        _combine_and_shuffle(train_parts),\n",
"        _combine_and_shuffle(validation_parts),\n",
"        _combine_and_shuffle(test_parts),\n",
"    )\n",
"\n",
"# Split df with the ratios defined above\n",
"train_df, validation_df, test_df = stratified_split(\n",
"    df, \"Label\", train_size, validation_size\n",
")\n",
"\n",
"# Show the class proportions of each split\n",
"for header, split_df in (\n",
"    (\"Training set:\", train_df),\n",
"    (\"\\nValidation set:\", validation_df),\n",
"    (\"\\nTest set:\", test_df),\n",
"):\n",
"    print(f\"{header}\\n{split_df['Label'].value_counts(normalize=True)}\")"
]
},
{
"cell_type": "code",
"execution_count": 81,
"id": "65808167-2b93-45b0-8506-ce722732ce77",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training set:\n",
"Label\n",
"ham 0.5\n",
"spam 0.5\n",
"Name: proportion, dtype: float64\n",
"\n",
"Validation set:\n",
"Label\n",
"ham 0.5\n",
"spam 0.5\n",
"Name: proportion, dtype: float64\n",
"\n",
"Test set:\n",
"Label\n",
"spam 0.5\n",
"ham 0.5\n",
"Name: proportion, dtype: float64\n"
]
}
],
"source": [
"# Fractions of each class used for training and validation;\n",
"# the remaining 0.2 becomes the test set\n",
"train_size = 0.7\n",
"validation_size = 0.1\n",
"\n",
"# Split the balanced dataset; this rebinds train_df, validation_df and test_df\n",
"train_df, validation_df, test_df = stratified_split(\n",
"    balanced_df, \"Label\", train_size, validation_size\n",
")\n",
"\n",
"# Show the class proportions of each split\n",
"for header, split_df in (\n",
"    (\"Training set:\", train_df),\n",
"    (\"\\nValidation set:\", validation_df),\n",
"    (\"\\nTest set:\", test_df),\n",
"):\n",
"    print(f\"{header}\\n{split_df['Label'].value_counts(normalize=True)}\")"
]
},
{
"cell_type": "markdown",
"id": "fae87bc1-14ca-4f89-8e12-49f77b0ec00d",
"metadata": {},
"source": [
"## Scikit-learn baseline"
]
},
{
"cell_type": "code",
"execution_count": 82,
"id": "180318b7-de18-4b05-b84a-ba97c72b9d8e",
"metadata": {},
"outputs": [],
"source": [
"from sklearn.feature_extraction.text import CountVectorizer\n",
"from sklearn.linear_model import LogisticRegression\n",
"from sklearn.metrics import accuracy_score, balanced_accuracy_score"
]
},
{
"cell_type": "code",
"execution_count": 83,
"id": "25090b7c-f516-4be2-8083-3a7187fe4635",
"metadata": {},
"outputs": [],
"source": [
"# The vectorizer is fitted on the training texts only\n",
"vectorizer = CountVectorizer()\n",
"X_train = vectorizer.fit_transform(train_df[\"Text\"])\n",
"\n",
"# Validation and test texts are only transformed with the fitted vectorizer\n",
"X_val, X_test = (\n",
"    vectorizer.transform(split_df[\"Text\"]) for split_df in (validation_df, test_df)\n",
")\n",
"\n",
"# Targets for each split\n",
"y_train = train_df[\"Label\"]\n",
"y_val = validation_df[\"Label\"]\n",
"y_test = test_df[\"Label\"]"
]
},
{
"cell_type": "code",
"execution_count": 84,
"id": "0247de3a-88f0-4b9c-becd-157baf3acf49",
"metadata": {},
"outputs": [],
"source": [
"def eval(model, X_train, y_train, X_val, y_val, X_test, y_test):\n",
"    \"\"\"Print accuracy and balanced accuracy of `model` on three data splits.\n",
"\n",
"    NOTE: this name shadows Python's built-in `eval` for the rest of the notebook.\n",
"\n",
"    Args:\n",
"        model: Object exposing a `predict` method.\n",
"        X_train, y_train: Training features and labels.\n",
"        X_val, y_val: Validation features and labels.\n",
"        X_test, y_test: Test features and labels.\n",
"    \"\"\"\n",
"    targets = (y_train, y_val, y_test)\n",
"\n",
"    # Predictions for the train / validation / test features, in that order\n",
"    predictions = [model.predict(features) for features in (X_train, X_val, X_test)]\n",
"\n",
"    # One score per split for each metric\n",
"    train_acc, val_acc, test_acc = (\n",
"        accuracy_score(y_true, y_pred) for y_true, y_pred in zip(targets, predictions)\n",
"    )\n",
"    train_bal, val_bal, test_bal = (\n",
"        balanced_accuracy_score(y_true, y_pred) for y_true, y_pred in zip(targets, predictions)\n",
"    )\n",
"\n",
"    # Report plain accuracy first, then balanced accuracy after a blank line\n",
"    print(f\"Training Accuracy: {train_acc*100:.2f}%\")\n",
"    print(f\"Validation Accuracy: {val_acc*100:.2f}%\")\n",
"    print(f\"Test Accuracy: {test_acc*100:.2f}%\")\n",
"\n",
"    print(f\"\\nTraining Balanced Accuracy: {train_bal*100:.2f}%\")\n",
"    print(f\"Validation Balanced Accuracy: {val_bal*100:.2f}%\")\n",
"    print(f\"Test Balanced Accuracy: {test_bal*100:.2f}%\")"
]
},
{
"cell_type": "code",
"execution_count": 85,
"id": "c29c6dfc-f72d-40ab-8cb5-783aad1a15ab",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training Accuracy: 50.00%\n",
"Validation Accuracy: 50.00%\n",
"Test Accuracy: 50.00%\n",
"\n",
"Training Balanced Accuracy: 50.00%\n",
"Validation Balanced Accuracy: 50.00%\n",
"Test Balanced Accuracy: 50.00%\n"
]
}
],
"source": [
"from sklearn.dummy import DummyClassifier\n",
"\n",
"# Baseline that always predicts the most frequent class seen during fitting\n",
"dummy_clf = DummyClassifier(strategy=\"most_frequent\").fit(X_train, y_train)\n",
"\n",
"# Report its scores on all three splits\n",
"eval(dummy_clf, X_train, y_train, X_val, y_val, X_test, y_test)"
]
},
{
"cell_type": "code",
"execution_count": 86,
"id": "088a8a3a-3b74-4d10-a51b-cb662569ae39",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training Accuracy: 99.81%\n",
"Validation Accuracy: 95.27%\n",
"Test Accuracy: 96.03%\n",
"\n",
"Training Balanced Accuracy: 99.81%\n",
"Validation Balanced Accuracy: 95.27%\n",
"Test Balanced Accuracy: 96.03%\n"
]
}
],
"source": [
"# Fit logistic regression (up to 1000 solver iterations) on the training features\n",
"model = LogisticRegression(max_iter=1000).fit(X_train, y_train)\n",
"\n",
"# Report its scores on all three splits\n",
"eval(model, X_train, y_train, X_val, y_val, X_test, y_test)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "34411348-45bc-4b01-bebf-b3602c002ef1",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "5a9bc6b1-c8b9-4d4f-bfe4-c5a4a8b0c756",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}