# Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt).
# Source for "Build a Large Language Model From Scratch"
#   - https://www.manning.com/books/build-a-large-language-model-from-scratch
# Code: https://github.com/rasbt/LLMs-from-scratch
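#
# Overview of the steps below: download and unpack the IMDb movie review
# dataset (aclImdb), load the labeled reviews into a pandas DataFrame,
# shuffle it, and write train.csv, validation.csv, and test.csv to the
# current working directory.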

import os
import sys
import tarfile
import time
import urllib.request

import pandas as pd


def reporthook(count, block_size, total_size):
    """Progress callback for urllib.request.urlretrieve: prints percent, size, speed, and elapsed time."""
    global start_time
    if count == 0:
        start_time = time.time()
    else:
        duration = time.time() - start_time
        progress_size = int(count * block_size)
        percent = count * block_size * 100 / total_size

        speed = int(progress_size / (1024 * duration)) if duration else 0  # KB/s
        sys.stdout.write(
            f"\r{int(percent)}% | {progress_size / (1024**2):.2f} MB "
            f"| {speed / 1024:.2f} MB/s | {duration:.2f} sec elapsed"
        )
        sys.stdout.flush()


def download_and_extract_dataset(dataset_url, target_file, directory):
    """Download the dataset archive and extract it, unless the target directory already exists."""
    if not os.path.exists(directory):
        if os.path.exists(target_file):
            os.remove(target_file)
        urllib.request.urlretrieve(dataset_url, target_file, reporthook)
        print("\nExtracting dataset ...")
        with tarfile.open(target_file, "r:gz") as tar:
            tar.extractall()
    else:
        print(f"Directory `{directory}` already exists. Skipping download.")


def load_dataset_to_dataframe(basepath="aclImdb", labels={"pos": 1, "neg": 0}):
    """Read all reviews from the extracted aclImdb folders into one shuffled DataFrame with "text" and "label" columns."""
    data_frames = []  # List to store each chunk of DataFrame
    for subset in ("test", "train"):
        for label in ("pos", "neg"):
            path = os.path.join(basepath, subset, label)
            for file in sorted(os.listdir(path)):
                with open(os.path.join(path, file), "r", encoding="utf-8") as infile:
                    # Create a DataFrame for each file and add it to the list
                    data_frames.append(pd.DataFrame({"text": [infile.read()], "label": [labels[label]]}))
    # Concatenate all DataFrame chunks together
    df = pd.concat(data_frames, ignore_index=True)
    df = df.sample(frac=1, random_state=123).reset_index(drop=True)  # Shuffle the DataFrame
    return df


def partition_and_save(df, sizes=(35000, 5000, 10000)):
    """Split the shuffled DataFrame into train/validation/test sets and save them as CSV files."""
    # The default sizes (35,000 / 5,000 / 10,000) sum to the 50,000 labeled reviews in the IMDb dataset

    # Shuffle the DataFrame
    df_shuffled = df.sample(frac=1, random_state=123).reset_index(drop=True)

    # Get indices for where to split the data
    train_end = sizes[0]
    val_end = sizes[0] + sizes[1]

    # Split the DataFrame
    train = df_shuffled.iloc[:train_end]
    val = df_shuffled.iloc[train_end:val_end]
    test = df_shuffled.iloc[val_end:]

    # Save to CSV files
    train.to_csv("train.csv", index=False)
    val.to_csv("validation.csv", index=False)
    test.to_csv("test.csv", index=False)


if __name__ == "__main__":
    dataset_url = "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"

    print("Downloading dataset ...")
    download_and_extract_dataset(dataset_url, "aclImdb_v1.tar.gz", "aclImdb")

    print("Creating data frames ...")
    df = load_dataset_to_dataframe()

    print("Partitioning and saving data frames ...")
    partition_and_save(df)
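
# Example usage (the file name below is illustrative; adjust it to however this
# script is saved locally):
#
#     python prepare_imdb_dataset.py
#
# After the script finishes, train.csv, validation.csv, and test.csv are
# written to the current working directory.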