# Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt).
# Source for "Build a Large Language Model From Scratch"
#   - https://www.manning.com/books/build-a-large-language-model-from-scratch
# Code: https://github.com/rasbt/LLMs-from-scratch

import os
import sys
import tarfile
import time
import urllib.request

import pandas as pd


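# Progress callback for urllib.request.urlretrieve: `count` is the number of
# blocks transferred so far, `block_size` is the block size in bytes, and
# `total_size` is the total size of the download in bytes.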
def reporthook(count, block_size, total_size):
    global start_time
    if count == 0:
        start_time = time.time()
    else:
        duration = time.time() - start_time
        progress_size = int(count * block_size)
        percent = count * block_size * 100 / total_size

        # Compute the speed in MB/s so it matches the printed unit
        speed = progress_size / (1024 ** 2 * duration) if duration else 0.0
        sys.stdout.write(
            f"\r{int(percent)}% | {progress_size / (1024**2):.2f} MB "
            f"| {speed:.2f} MB/s | {duration:.2f} sec elapsed"
        )
        sys.stdout.flush()


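# Download `dataset_url` to `target_file` and extract the gzipped tar archive
# into the current working directory; skipped if `directory` already exists
# from a previous run.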
def download_and_extract_dataset(dataset_url, target_file, directory):
    if not os.path.exists(directory):
        if os.path.exists(target_file):
            os.remove(target_file)
        urllib.request.urlretrieve(dataset_url, target_file, reporthook)
        print("\nExtracting dataset ...")
        with tarfile.open(target_file, "r:gz") as tar:
            tar.extractall()
    else:
        print(f"Directory `{directory}` already exists. Skipping download.")


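# Read every review under `basepath/{test,train}/{pos,neg}` into a single
# shuffled DataFrame with a "text" column and a binary "label" column
# (1 = positive, 0 = negative).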
def load_dataset_to_dataframe(basepath="aclImdb", labels={"pos": 1, "neg": 0}):
    data_frames = []  # List to store each chunk of DataFrame
    for subset in ("test", "train"):
        for label in ("pos", "neg"):
            path = os.path.join(basepath, subset, label)
            for file in sorted(os.listdir(path)):
                with open(os.path.join(path, file), "r", encoding="utf-8") as infile:
                    # Create a DataFrame for each file and add it to the list
                    data_frames.append(pd.DataFrame({"text": [infile.read()], "label": [labels[label]]}))
    # Concatenate all DataFrame chunks together
    df = pd.concat(data_frames, ignore_index=True)
    df = df.sample(frac=1, random_state=123).reset_index(drop=True)  # Shuffle the DataFrame
    return df


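# Shuffle the combined DataFrame once more and split it into train/validation/test
# partitions of `sizes` rows (35,000 / 5,000 / 10,000 by default), written to
# train.csv, validation.csv, and test.csv in the current working directory.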
def partition_and_save(df, sizes=(35000, 5000, 10000)):
    # Shuffle the DataFrame
    df_shuffled = df.sample(frac=1, random_state=123).reset_index(drop=True)

    # Get indices for where to split the data
    train_end = sizes[0]
    val_end = sizes[0] + sizes[1]

    # Split the DataFrame
    train = df_shuffled.iloc[:train_end]
    val = df_shuffled.iloc[train_end:val_end]
    test = df_shuffled.iloc[val_end:]

    # Save to CSV files
    train.to_csv("train.csv", index=False)
    val.to_csv("validation.csv", index=False)
    test.to_csv("test.csv", index=False)


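# End-to-end pipeline: download and extract the raw IMDb movie reviews, load
# them into a DataFrame, and write the train/validation/test CSV splits.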
if __name__ == "__main__":
    dataset_url = "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
    print("Downloading dataset ...")
    download_and_extract_dataset(dataset_url, "aclImdb_v1.tar.gz", "aclImdb")
    print("Creating data frames ...")
    df = load_dataset_to_dataframe()
    print("Partitioning and saving data frames ...")
    partition_and_save(df)