{ "cells": [ { "cell_type": "markdown", "id": "8968a681-2db1-4840-bb73-7d6c95986825", "metadata": {}, "source": [ "\n", "\n", "\n", "\n", "\n", "
\n", "\n", "Supplementary code for the Build a Large Language Model From Scratch book by Sebastian Raschka
\n", "
Code repository: https://github.com/rasbt/LLMs-from-scratch\n", "
\n", "
\n", "\n", "
" ] }, { "cell_type": "markdown", "id": "8b6e1cdd-b14e-4368-bdbb-9bf7ab821791", "metadata": {}, "source": [ "# Scikit-learn Logistic Regression Model" ] }, { "cell_type": "code", "execution_count": 1, "id": "c2a72242-6197-4bef-aa05-696a152350d5", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "100% | 80.23 MB | 4.37 MB/s | 18.38 sec elapsed" ] } ], "source": [ "!python download-prepare-dataset.py" ] }, { "cell_type": "code", "execution_count": 14, "id": "69f32433-e19c-4066-b806-8f30b408107f", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "\n", "train_df = pd.read_csv(\"train.csv\")\n", "val_df = pd.read_csv(\"validation.csv\")\n", "test_df = pd.read_csv(\"test.csv\")" ] }, { "cell_type": "code", "execution_count": 16, "id": "0808b212-fe91-48d9-80b8-55519f8835d5", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
textlabel
0The only reason I saw \"Shakedown\" was that it ...0
1This is absolute drivel, designed to shock and...0
2Lots of scenes and dialogue are flat-out goofy...1
3** and 1/2 stars out of **** Lifeforce is one ...1
4I learned a thing: you have to take this film ...1
\n", "
" ], "text/plain": [ " text label\n", "0 The only reason I saw \"Shakedown\" was that it ... 0\n", "1 This is absolute drivel, designed to shock and... 0\n", "2 Lots of scenes and dialogue are flat-out goofy... 1\n", "3 ** and 1/2 stars out of **** Lifeforce is one ... 1\n", "4 I learned a thing: you have to take this film ... 1" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "train_df.head()" ] }, { "cell_type": "markdown", "id": "fae87bc1-14ca-4f89-8e12-49f77b0ec00d", "metadata": {}, "source": [ "## Scikit-learn baseline" ] }, { "cell_type": "code", "execution_count": 17, "id": "180318b7-de18-4b05-b84a-ba97c72b9d8e", "metadata": {}, "outputs": [], "source": [ "from sklearn.feature_extraction.text import CountVectorizer\n", "from sklearn.linear_model import LogisticRegression\n", "from sklearn.metrics import accuracy_score" ] }, { "cell_type": "code", "execution_count": 20, "id": "25090b7c-f516-4be2-8083-3a7187fe4635", "metadata": {}, "outputs": [], "source": [ "vectorizer = CountVectorizer()\n", "\n", "X_train = vectorizer.fit_transform(train_df[\"text\"])\n", "X_val = vectorizer.transform(val_df[\"text\"])\n", "X_test = vectorizer.transform(test_df[\"text\"])\n", "\n", "y_train, y_val, y_test = train_df[\"label\"], val_df[\"label\"], test_df[\"label\"]" ] }, { "cell_type": "code", "execution_count": 22, "id": "0247de3a-88f0-4b9c-becd-157baf3acf49", "metadata": {}, "outputs": [], "source": [
    "def eval(model, X_train, y_train, X_val, y_val, X_test, y_test):\n",
    "    \"\"\"Print the accuracy of `model` on the training, validation, and test splits.\n",
    "\n",
    "    The name shadows Python's built-in `eval`; it is kept because the cells\n",
    "    below call the function under this name.\n",
    "\n",
    "    Args:\n",
    "        model: Estimator exposing `predict(X)`; the callers below fit it first.\n",
    "        X_train, X_val, X_test: Feature matrices passed to `model.predict`.\n",
    "        y_train, y_val, y_test: True labels compared against the predictions.\n",
    "\n",
    "    Returns:\n",
    "        None. The three accuracies are printed as percentages.\n",
    "    \"\"\"\n",
    "    # Making predictions\n",
    "    y_pred_train = model.predict(X_train)\n",
    "    y_pred_val = model.predict(X_val)\n",
    "    y_pred_test = model.predict(X_test)\n",
    "\n",
    "    # Calculating accuracy. Only `accuracy_score` is imported in this notebook,\n",
    "    # so the function relies on nothing else and runs on a fresh kernel.\n",
    "    accuracy_train = accuracy_score(y_train, y_pred_train)\n",
    "    accuracy_val = accuracy_score(y_val, y_pred_val)\n",
    "    accuracy_test = accuracy_score(y_test, y_pred_test)\n",
    "\n",
    "    # Printing the results\n",
    "    print(f\"Training Accuracy: {accuracy_train*100:.2f}%\")\n",
    "    print(f\"Validation Accuracy: {accuracy_val*100:.2f}%\")\n",
    "    print(f\"Test Accuracy: {accuracy_test*100:.2f}%\")"
 ] }, { "cell_type": "code", "execution_count": 23, "id": "c29c6dfc-f72d-40ab-8cb5-783aad1a15ab", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Training Accuracy: 50.01%\n", "Validation Accuracy: 50.14%\n", "Test Accuracy: 49.91%\n" ] } ], "source": [ "from sklearn.dummy import DummyClassifier\n", "\n", "# Create a dummy classifier with the strategy to predict the most frequent class\n", "dummy_clf = DummyClassifier(strategy=\"most_frequent\")\n", "dummy_clf.fit(X_train, y_train)\n", "\n", "eval(dummy_clf, X_train, y_train, X_val, y_val, X_test, y_test)" ] }, { "cell_type": "code", "execution_count": 24, "id": "088a8a3a-3b74-4d10-a51b-cb662569ae39", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Training Accuracy: 99.80%\n", "Validation Accuracy: 88.62%\n", "Test Accuracy: 88.85%\n" ] } ], "source": [ "model = LogisticRegression(max_iter=1000)\n", "model.fit(X_train, y_train)\n", "eval(model, X_train, y_train, X_val, y_val, X_test, y_test)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.4" } }, "nbformat": 4, "nbformat_minor": 5 }