# Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt).
# Source for "Build a Large Language Model From Scratch"
# - https://www.manning.com/books/build-a-large-language-model-from-scratch
# Code: https://github.com/rasbt/LLMs-from-scratch
import os
import sys
import tarfile
import time
import urllib.request

import pandas as pd


def reporthook(count, block_size, total_size):
    global start_time
    if count == 0:
        start_time = time.time()
    else:
        duration = time.time() - start_time
        progress_size = int(count * block_size)
        percent = count * block_size * 100 / total_size
        speed = progress_size / (1024**2 * duration) if duration else 0  # MB/s
        sys.stdout.write(
            f"\r{int(percent)}% | {progress_size / (1024**2):.2f} MB "
            f"| {speed:.2f} MB/s | {duration:.2f} sec elapsed"
        )
        sys.stdout.flush()
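
# Note: `reporthook` follows the progress-callback signature that
# urllib.request.urlretrieve expects; it is called as
# reporthook(block_count, block_size, total_size) after each downloaded block,
# which is what drives the progress line printed above.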


def download_and_extract_dataset(dataset_url, target_file, directory):
    if not os.path.exists(directory):
        if os.path.exists(target_file):
            os.remove(target_file)
        urllib.request.urlretrieve(dataset_url, target_file, reporthook)
        print("\nExtracting dataset ...")
        with tarfile.open(target_file, "r:gz") as tar:
            tar.extractall()
    else:
        print(f"Directory `{directory}` already exists. Skipping download.")


def load_dataset_to_dataframe(basepath="aclImdb", labels={"pos": 1, "neg": 0}):
    data_frames = []  # List to store each chunk of DataFrame
    for subset in ("test", "train"):
        for label in ("pos", "neg"):
            path = os.path.join(basepath, subset, label)
            for file in sorted(os.listdir(path)):
                with open(os.path.join(path, file), "r", encoding="utf-8") as infile:
                    # Create a DataFrame for each file and add it to the list
                    data_frames.append(pd.DataFrame({"text": [infile.read()], "label": [labels[label]]}))
    # Concatenate all DataFrame chunks together
    df = pd.concat(data_frames, ignore_index=True)
    df = df.sample(frac=1, random_state=123).reset_index(drop=True)  # Shuffle the DataFrame
    return df


def partition_and_save(df, sizes=(35000, 5000, 10000)):
    # Shuffle the DataFrame
    df_shuffled = df.sample(frac=1, random_state=123).reset_index(drop=True)

    # Get indices for where to split the data
    train_end = sizes[0]
    val_end = sizes[0] + sizes[1]

    # Split the DataFrame
    train = df_shuffled.iloc[:train_end]
    val = df_shuffled.iloc[train_end:val_end]
    test = df_shuffled.iloc[val_end:]

    # Save to CSV files
    train.to_csv("train.csv", index=False)
    val.to_csv("validation.csv", index=False)
    test.to_csv("test.csv", index=False)
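
# A minimal sketch (an assumed downstream step, not part of this script) of how
# the saved splits could be loaded back with pandas:
#
#     train_df = pd.read_csv("train.csv")
#     val_df = pd.read_csv("validation.csv")
#     test_df = pd.read_csv("test.csv")
#
# Each file has a "text" column with the raw review and a "label" column
# (1 = positive, 0 = negative).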


if __name__ == "__main__":
    dataset_url = "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"

    print("Downloading dataset ...")
    download_and_extract_dataset(dataset_url, "aclImdb_v1.tar.gz", "aclImdb")

    print("Creating data frames ...")
    df = load_dataset_to_dataframe()

    print("Partitioning and saving data frames ...")
    partition_and_save(df)
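
# Running this script downloads the 50,000-review IMDB dataset
# (25,000 train + 25,000 test reviews in the original archive), pools all
# reviews, and writes three shuffled splits: train.csv (35,000 rows),
# validation.csv (5,000 rows), and test.csv (10,000 rows).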