LLMs-from-scratch/ch06/03_bonus_imdb-classification/sklearn-baseline.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 75,
   "id": "b612c4c1-fa3c-47b9-a8ce-9e32f371e160",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "sms_spam_collection/SMSSpamCollection.tsv already exists. Skipping download and extraction.\n"
     ]
    }
   ],
   "source": [
    "import urllib.request\n",
    "import zipfile\n",
    "import os\n",
    "from pathlib import Path\n",
    "\n",
    "# Remote archive, local download target, extraction directory, and final data file\n",
    "url = \"https://archive.ics.uci.edu/static/public/228/sms+spam+collection.zip\"\n",
    "zip_path = \"sms_spam_collection.zip\"\n",
    "extract_to = \"sms_spam_collection\"\n",
    "new_file_path = Path(extract_to, \"SMSSpamCollection.tsv\")\n",
    "\n",
    "def download_and_unzip(url, zip_path, extract_to, new_file_path):\n",
    "    \"\"\"Download the zip archive at `url`, extract it, and rename the data file.\n",
    "\n",
    "    After extraction, the file named `SMSSpamCollection` inside `extract_to`\n",
    "    is renamed to `new_file_path`.\n",
    "\n",
    "    Args:\n",
    "        url: Address of the zip archive.\n",
    "        zip_path: Local path the downloaded archive is written to.\n",
    "        extract_to: Directory the archive is extracted into.\n",
    "        new_file_path: Final location of the data file (str or Path).\n",
    "\n",
    "    Returns:\n",
    "        None. If `new_file_path` already exists, nothing is downloaded.\n",
    "    \"\"\"\n",
    "    # Accept plain strings as well as Path objects\n",
    "    new_file_path = Path(new_file_path)\n",
    "\n",
    "    # Check if the target file already exists\n",
    "    if new_file_path.exists():\n",
    "        print(f\"{new_file_path} already exists. Skipping download and extraction.\")\n",
    "        return\n",
    "\n",
    "    # Downloading the file\n",
    "    with urllib.request.urlopen(url) as response, open(zip_path, \"wb\") as out_file:\n",
    "        out_file.write(response.read())\n",
    "\n",
    "    # Unzipping the file\n",
    "    with zipfile.ZipFile(zip_path, \"r\") as zip_ref:\n",
    "        zip_ref.extractall(extract_to)\n",
    "\n",
    "    # Renaming the file to indicate its format\n",
    "    original_file = Path(extract_to) / \"SMSSpamCollection\"\n",
    "    os.rename(original_file, new_file_path)\n",
    "    print(f\"File downloaded and saved as {new_file_path}\")\n",
    "\n",
    "# Execute the function (it returns early when the data file already exists)\n",
    "download_and_unzip(url, zip_path, extract_to, new_file_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 76,
   "id": "69f32433-e19c-4066-b806-8f30b408107f",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Label</th>\n",
       "      <th>Text</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>ham</td>\n",
       "      <td>Aight text me when you're back at mu and I'll ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>ham</td>\n",
       "      <td>Our Prashanthettan's mother passed away last n...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>ham</td>\n",
       "      <td>No it will reach by 9 only. She telling she wi...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>ham</td>\n",
       "      <td>Do you know when the result.</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>spam</td>\n",
       "      <td>Hi. Customer Loyalty Offer:The NEW Nokia6650 M...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5567</th>\n",
       "      <td>ham</td>\n",
       "      <td>I accidentally brought em home in the box</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5568</th>\n",
       "      <td>spam</td>\n",
       "      <td>Moby Pub Quiz.Win a £100 High Street prize if ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5569</th>\n",
       "      <td>ham</td>\n",
       "      <td>Que pases un buen tiempo or something like that</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5570</th>\n",
       "      <td>ham</td>\n",
       "      <td>Nowadays people are notixiquating the laxinorf...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5571</th>\n",
       "      <td>ham</td>\n",
       "      <td>Ard 4 lor...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5572 rows × 2 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "     Label                                               Text\n",
       "0      ham  Aight text me when you're back at mu and I'll ...\n",
       "1      ham  Our Prashanthettan's mother passed away last n...\n",
       "2      ham  No it will reach by 9 only. She telling she wi...\n",
       "3      ham                       Do you know when the result.\n",
       "4     spam  Hi. Customer Loyalty Offer:The NEW Nokia6650 M...\n",
       "...    ...                                                ...\n",
       "5567   ham          I accidentally brought em home in the box\n",
       "5568  spam  Moby Pub Quiz.Win a £100 High Street prize if ...\n",
       "5569   ham    Que pases un buen tiempo or something like that\n",
       "5570   ham  Nowadays people are notixiquating the laxinorf...\n",
       "5571   ham                                       Ard 4 lor...\n",
       "\n",
       "[5572 rows x 2 columns]"
      ]
     },
     "execution_count": 76,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "\n",
    "# The file is tab-separated and read without a header row, so column names are supplied here\n",
    "df = (\n",
    "    pd.read_csv(new_file_path, sep=\"\\t\", header=None, names=[\"Label\", \"Text\"])\n",
    "    .sample(frac=1, random_state=123)  # Shuffle the DataFrame\n",
    "    .reset_index(drop=True)\n",
    ")\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 77,
   "id": "4b7beeba-9f3a-45f0-b2dc-76bb155a8f0e",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Label\n",
      "ham     4825\n",
      "spam     747\n",
      "Name: count, dtype: int64\n"
     ]
    }
   ],
   "source": [
    "# Class distribution: number of rows per label\n",
    "label_counts = df[\"Label\"].value_counts()\n",
    "print(label_counts)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 78,
   "id": "b3db862a-9e03-4715-babb-9b699e4f4a36",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Label\n",
      "spam    747\n",
      "ham     747\n",
      "Name: count, dtype: int64\n"
     ]
    }
   ],
   "source": [
    "# Undersample 'ham' so that both classes have the same number of rows\n",
    "spam_df = df[df[\"Label\"] == \"spam\"]\n",
    "\n",
    "# Count the instances of 'spam'\n",
    "n_spam = spam_df.shape[0]\n",
    "\n",
    "# Randomly sample 'ham' instances to match the number of 'spam' instances;\n",
    "# the fixed seed keeps the subset identical across kernel restarts\n",
    "ham_sampled = df[df[\"Label\"] == \"ham\"].sample(n_spam, random_state=123)\n",
    "\n",
    "# Combine the sampled 'ham' with all 'spam'\n",
    "balanced_df = pd.concat([ham_sampled, spam_df])\n",
    "\n",
    "# Shuffle the DataFrame (seeded for the same reason) and rebuild the index\n",
    "balanced_df = balanced_df.sample(frac=1, random_state=123).reset_index(drop=True)\n",
    "\n",
    "# Now balanced_df is the balanced DataFrame\n",
    "print(balanced_df[\"Label\"].value_counts())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 79,
   "id": "0af991e5-98ef-439a-a43d-63a581a2cc6d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Encode the string labels as integers. Values missing from the mapping\n",
    "# (e.g. labels already encoded by an earlier run of this cell) are kept as-is,\n",
    "# so re-running the cell does not turn the column into NaN.\n",
    "label_to_id = {\"ham\": 0, \"spam\": 1}\n",
    "df[\"Label\"] = df[\"Label\"].map(lambda label: label_to_id.get(label, label))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 80,
   "id": "2f5b00ef-e3ed-4819-b271-5f355848feb5",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Training set:\n",
      "Label\n",
      "0    0.86612\n",
      "1    0.13388\n",
      "Name: proportion, dtype: float64\n",
      "\n",
      "Validation set:\n",
      "Label\n",
      "0    0.866906\n",
      "1    0.133094\n",
      "Name: proportion, dtype: float64\n",
      "\n",
      "Test set:\n",
      "Label\n",
      "0    0.864816\n",
      "1    0.135184\n",
      "Name: proportion, dtype: float64\n"
     ]
    }
   ],
   "source": [
    "# Define split ratios: share of each class used for training and validation\n",
    "train_size = 0.7\n",
    "validation_size = 0.1\n",
    "# Test size is implied to be 0.2 as the remainder\n",
    "\n",
    "# Split the data\n",
    "def stratified_split(df, stratify_col, train_frac, validation_frac, random_state=123):\n",
    "    \"\"\"Split a DataFrame into train/validation/test sets, class by class.\n",
    "\n",
    "    Each class (unique value of `stratify_col`) is cut by position: the first\n",
    "    `train_frac` share of its rows goes to the training set, the next\n",
    "    `validation_frac` share to the validation set, and the rest to the test\n",
    "    set. Every resulting set is then shuffled and re-indexed.\n",
    "\n",
    "    Args:\n",
    "        df: Input DataFrame. Rows are taken in their existing order, so\n",
    "            shuffle beforehand if that order is not already random.\n",
    "        stratify_col: Name of the column holding the class labels.\n",
    "        train_frac: Fraction of each class assigned to the training set.\n",
    "        validation_frac: Fraction of each class assigned to the validation set.\n",
    "        random_state: Seed used for the final shuffle of each set.\n",
    "\n",
    "    Returns:\n",
    "        Tuple `(train, validation, test)` of DataFrames with fresh 0-based indices.\n",
    "    \"\"\"\n",
    "    train_parts, validation_parts, test_parts = [], [], []\n",
    "\n",
    "    # Stratify split by the unique values in the column\n",
    "    for value in df[stratify_col].unique():\n",
    "        # Filter the DataFrame for the class\n",
    "        df_class = df[df[stratify_col] == value]\n",
    "\n",
    "        # Calculate class split sizes (int() truncates towards zero)\n",
    "        train_end = int(len(df_class) * train_frac)\n",
    "        validation_end = train_end + int(len(df_class) * validation_frac)\n",
    "\n",
    "        # Slice the class rows into the three sets\n",
    "        train_parts.append(df_class[:train_end])\n",
    "        validation_parts.append(df_class[train_end:validation_end])\n",
    "        test_parts.append(df_class[validation_end:])\n",
    "\n",
    "    def _combine(parts):\n",
    "        # Concatenate once per set instead of growing a DataFrame inside the loop;\n",
    "        # pd.concat rejects an empty list, so fall back to an empty frame\n",
    "        combined = pd.concat(parts, axis=0) if parts else pd.DataFrame()\n",
    "        # Shuffle the set again and rebuild the index\n",
    "        return combined.sample(frac=1, random_state=random_state).reset_index(drop=True)\n",
    "\n",
    "    return _combine(train_parts), _combine(validation_parts), _combine(test_parts)\n",
    "\n",
    "# Apply the stratified split function\n",
    "train_df, validation_df, test_df = stratified_split(\n",
    "    df, \"Label\", train_size, validation_size\n",
    ")\n",
    "\n",
    "# Check the results: class proportions per split, emitted with a single print call\n",
    "print(\n",
    "    f\"Training set:\\n{train_df['Label'].value_counts(normalize=True)}\",\n",
    "    f\"\\nValidation set:\\n{validation_df['Label'].value_counts(normalize=True)}\",\n",
    "    f\"\\nTest set:\\n{test_df['Label'].value_counts(normalize=True)}\",\n",
    "    sep=\"\\n\",\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 81,
   "id": "65808167-2b93-45b0-8506-ce722732ce77",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Training set:\n",
      "Label\n",
      "ham     0.5\n",
      "spam    0.5\n",
      "Name: proportion, dtype: float64\n",
      "\n",
      "Validation set:\n",
      "Label\n",
      "ham     0.5\n",
      "spam    0.5\n",
      "Name: proportion, dtype: float64\n",
      "\n",
      "Test set:\n",
      "Label\n",
      "spam    0.5\n",
      "ham     0.5\n",
      "Name: proportion, dtype: float64\n"
     ]
    }
   ],
   "source": [
    "# Define split ratios\n",
    "train_size = 0.7\n",
    "validation_size = 0.1\n",
    "# Test size is implied to be 0.2 as the remainder\n",
    "\n",
    "# Apply the stratified split function, this time to the balanced DataFrame\n",
    "train_df, validation_df, test_df = stratified_split(\n",
    "    balanced_df, \"Label\", train_size, validation_size\n",
    ")\n",
    "\n",
    "# Check the results: class proportions within each split\n",
    "train_share = train_df[\"Label\"].value_counts(normalize=True)\n",
    "validation_share = validation_df[\"Label\"].value_counts(normalize=True)\n",
    "test_share = test_df[\"Label\"].value_counts(normalize=True)\n",
    "\n",
    "print(f\"Training set:\\n{train_share}\")\n",
    "print(f\"\\nValidation set:\\n{validation_share}\")\n",
    "print(f\"\\nTest set:\\n{test_share}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "fae87bc1-14ca-4f89-8e12-49f77b0ec00d",
   "metadata": {},
   "source": [
    "## Scikit-learn baseline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 82,
   "id": "180318b7-de18-4b05-b84a-ba97c72b9d8e",
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.metrics import accuracy_score, balanced_accuracy_score"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 83,
   "id": "25090b7c-f516-4be2-8083-3a7187fe4635",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fit the vectorizer on the training texts, then reuse it unchanged for the other splits\n",
    "vectorizer = CountVectorizer()\n",
    "X_train = vectorizer.fit_transform(train_df[\"Text\"])\n",
    "X_val, X_test = (\n",
    "    vectorizer.transform(split[\"Text\"]) for split in (validation_df, test_df)\n",
    ")\n",
    "\n",
    "# Target labels for each split\n",
    "y_train = train_df[\"Label\"]\n",
    "y_val = validation_df[\"Label\"]\n",
    "y_test = test_df[\"Label\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 84,
   "id": "0247de3a-88f0-4b9c-becd-157baf3acf49",
   "metadata": {},
   "outputs": [],
   "source": [
    "def eval(model, X_train, y_train, X_val, y_val, X_test, y_test):\n",
    "    \"\"\"Print accuracy and balanced accuracy of `model` on the three splits.\n",
    "\n",
    "    Predictions come from `model.predict`; nothing is returned.\n",
    "    Note that this name shadows Python's built-in `eval` in the notebook.\n",
    "    \"\"\"\n",
    "    features = {\"train\": X_train, \"val\": X_val, \"test\": X_test}\n",
    "    targets = {\"train\": y_train, \"val\": y_val, \"test\": y_test}\n",
    "\n",
    "    # Making predictions for every split first\n",
    "    predicted = {split: model.predict(X) for split, X in features.items()}\n",
    "\n",
    "    # Calculating accuracy and balanced accuracy per split\n",
    "    accuracy, balanced = {}, {}\n",
    "    for split, y_pred in predicted.items():\n",
    "        accuracy[split] = accuracy_score(targets[split], y_pred)\n",
    "        balanced[split] = balanced_accuracy_score(targets[split], y_pred)\n",
    "\n",
    "    # Printing the results: plain accuracies first, then balanced accuracies\n",
    "    print(f\"Training Accuracy: {accuracy['train']*100:.2f}%\")\n",
    "    print(f\"Validation Accuracy: {accuracy['val']*100:.2f}%\")\n",
    "    print(f\"Test Accuracy: {accuracy['test']*100:.2f}%\")\n",
    "\n",
    "    print(f\"\\nTraining Balanced Accuracy: {balanced['train']*100:.2f}%\")\n",
    "    print(f\"Validation Balanced Accuracy: {balanced['val']*100:.2f}%\")\n",
    "    print(f\"Test Balanced Accuracy: {balanced['test']*100:.2f}%\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 85,
   "id": "c29c6dfc-f72d-40ab-8cb5-783aad1a15ab",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Training Accuracy: 50.00%\n",
      "Validation Accuracy: 50.00%\n",
      "Test Accuracy: 50.00%\n",
      "\n",
      "Training Balanced Accuracy: 50.00%\n",
      "Validation Balanced Accuracy: 50.00%\n",
      "Test Balanced Accuracy: 50.00%\n"
     ]
    }
   ],
   "source": [
    "from sklearn.dummy import DummyClassifier\n",
    "\n",
    "# Baseline: a dummy classifier using the \"most_frequent\" strategy, fitted on the training split\n",
    "dummy_clf = DummyClassifier(strategy=\"most_frequent\").fit(X_train, y_train)\n",
    "\n",
    "# Report accuracy and balanced accuracy on all three splits\n",
    "eval(dummy_clf, X_train, y_train, X_val, y_val, X_test, y_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 86,
   "id": "088a8a3a-3b74-4d10-a51b-cb662569ae39",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Training Accuracy: 99.81%\n",
      "Validation Accuracy: 95.27%\n",
      "Test Accuracy: 96.03%\n",
      "\n",
      "Training Balanced Accuracy: 99.81%\n",
      "Validation Balanced Accuracy: 95.27%\n",
      "Test Balanced Accuracy: 96.03%\n"
     ]
    }
   ],
   "source": [
    "# Logistic regression fitted on the vectorized training texts\n",
    "model = LogisticRegression(max_iter=1000)\n",
    "model.fit(X=X_train, y=y_train)\n",
    "\n",
    "# Report accuracy and balanced accuracy on all three splits\n",
    "eval(model, X_train, y_train, X_val, y_val, X_test, y_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "34411348-45bc-4b01-bebf-b3602c002ef1",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5a9bc6b1-c8b9-4d4f-bfe4-c5a4a8b0c756",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}