mirror of
https://github.com/rasbt/LLMs-from-scratch.git
synced 2025-08-08 08:42:56 +00:00
76 lines
2.8 KiB
Python
76 lines
2.8 KiB
Python
# Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt).
|
|
# Source for "Build a Large Language Model From Scratch"
|
|
# - https://www.manning.com/books/build-a-large-language-model-from-scratch
|
|
# Code: https://github.com/rasbt/LLMs-from-scratch
|
|
|
|
import pandas as pd
|
|
from sklearn.feature_extraction.text import CountVectorizer
|
|
from sklearn.linear_model import LogisticRegression
|
|
from sklearn.metrics import accuracy_score
|
|
# from sklearn.metrics import balanced_accuracy_score
|
|
from sklearn.dummy import DummyClassifier
|
|
|
|
|
|
def load_dataframes(train_path="train.csv", val_path="validation.csv", test_path="test.csv"):
    """Read the training, validation and test splits from CSV files.

    Args:
        train_path: Path of the training CSV. Defaults to ``"train.csv"``.
        val_path: Path of the validation CSV. Defaults to ``"validation.csv"``.
        test_path: Path of the test CSV. Defaults to ``"test.csv"``.

    Returns:
        A ``(df_train, df_val, df_test)`` tuple of DataFrames, in that order.
    """
    # Relative default paths are resolved against the current working
    # directory, exactly as the previously hard-coded file names were.
    df_train = pd.read_csv(train_path)
    df_val = pd.read_csv(val_path)
    df_test = pd.read_csv(test_path)

    return df_train, df_val, df_test
|
|
|
|
|
|
def eval(model, X_train, y_train, X_val, y_val, X_test, y_test):
    """Print and return a fitted model's accuracy on the three data splits.

    Args:
        model: Fitted estimator exposing ``predict(X)``.
        X_train, X_val, X_test: Features handed to ``model.predict``.
        y_train, y_val, y_test: True labels of the corresponding split.

    Returns:
        dict: The ``accuracy_score`` result of each split, keyed by
        ``"train"``, ``"val"`` and ``"test"``.
    """
    # NOTE: this name shadows the builtin ``eval``; it is kept unchanged
    # because existing callers invoke the function under this name.
    splits = (
        ("Training", "train", X_train, y_train),
        ("Validation", "val", X_val, y_val),
        ("Test", "test", X_test, y_test),
    )

    # Making predictions (all splits first, then scoring, as before)
    predictions = [model.predict(X) for _, _, X, _ in splits]

    # Calculating accuracy
    # Balanced accuracy is intentionally not reported here; it could be
    # scored the same way with sklearn.metrics.balanced_accuracy_score.
    accuracies = {
        key: accuracy_score(y_true, y_pred)
        for (_, key, _, y_true), y_pred in zip(splits, predictions)
    }

    # Printing the results only after every split was scored, so a failure
    # on any split does not leave a partial report behind.
    for title, key, _, _ in splits:
        print(f"{title} Accuracy: {accuracies[key]*100:.2f}%")

    return accuracies
|
|
|
|
|
|
if __name__ == "__main__":
    train_df, val_df, test_df = load_dataframes()

    # -----------------------------------------------------------------
    # Bag-of-words features: the vocabulary is fitted on the training
    # text only and then reused to transform the other two splits.
    # -----------------------------------------------------------------
    bow = CountVectorizer()

    X_train = bow.fit_transform(train_df["text"])
    X_val = bow.transform(val_df["text"])
    X_test = bow.transform(test_df["text"])

    y_train = train_df["label"]
    y_val = val_df["label"]
    y_test = test_df["label"]

    # Same positional arguments for every evaluation call below.
    eval_args = (X_train, y_train, X_val, y_val, X_test, y_test)

    # -----------------------------------------------------------------
    # Model training and evaluation
    # -----------------------------------------------------------------

    # Baseline: always predict the most frequent training class.
    baseline = DummyClassifier(strategy="most_frequent").fit(X_train, y_train)

    print("Dummy classifier:")
    eval(baseline, *eval_args)

    print("\n\nLogistic regression classifier:")
    log_reg = LogisticRegression(max_iter=1000).fit(X_train, y_train)
    eval(log_reg, *eval_args)
|