2024-03-19 09:26:26 -05:00
|
|
|
# Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt).
|
|
|
|
# Source for "Build a Large Language Model From Scratch"
|
|
|
|
# - https://www.manning.com/books/build-a-large-language-model-from-scratch
|
|
|
|
# Code: https://github.com/rasbt/LLMs-from-scratch
|
|
|
|
|
2024-03-13 08:34:39 -05:00
|
|
|
"""
|
|
|
|
Script that processes the Project Gutenberg files into fewer larger files.
|
|
|
|
"""
|
|
|
|
|
|
|
|
import argparse
|
|
|
|
import os
|
2024-04-02 07:12:22 -05:00
|
|
|
import re
|
2024-08-10 06:09:14 +08:00
|
|
|
from tqdm import tqdm
|
|
|
|
from gutenberg.src.cleanup import strip_headers
|
|
|
|
|
|
|
|
|
|
|
|
def is_english(text, threshold=0.9):
    """Heuristically decide whether *text* is primarily English.

    The check is a simple character-level proxy: the text counts as English
    when the fraction of ASCII characters (code points below 128) is
    strictly greater than *threshold*.

    Args:
        text: The string to inspect.
        threshold: Minimum (exclusive) ratio of ASCII characters required.

    Returns:
        True if the ASCII ratio exceeds *threshold*, otherwise False.
        Empty text returns False, since there is nothing to classify and
        the ratio would otherwise be a division by zero.
    """
    if not text:
        return False
    ascii_chars = sum(1 for c in text if ord(c) < 128)
    return ascii_chars / len(text) > threshold
|
2024-03-13 08:34:39 -05:00
|
|
|
|
|
|
|
|
|
|
|
def combine_files(file_paths, target_dir, max_size_mb=500, separator="<|endoftext|>", fallback_encoding="latin1"):
    """Concatenate many text files into fewer, size-capped output files.

    Each input file is read as UTF-8 (retrying with *fallback_encoding* on a
    decode error), skipped if ``is_english`` rejects it, passed through
    ``strip_headers``, and has runs of blank lines collapsed. The resulting
    documents are joined with *separator* and written to
    ``combined_<n>.txt`` files inside *target_dir*.

    Args:
        file_paths: Iterable of paths to the input text files.
        target_dir: Directory for the combined files; created if missing.
        max_size_mb: Approximate upper bound, in megabytes, for the summed
            UTF-8 size of the documents in one output file. Separator bytes
            are not counted, and a single document larger than the limit is
            still written, alone, to its own file.
        separator: String inserted between documents in an output file.
        fallback_encoding: Encoding used when a file is not valid UTF-8.

    Returns:
        The number of combined files written (0 if no document was kept).
    """
    os.makedirs(target_dir, exist_ok=True)

    max_size_bytes = max_size_mb * 1024 * 1024
    current_content = []
    current_size = 0
    files_written = 0

    def write_chunk(documents):
        # Output files are numbered consecutively starting from 1.
        nonlocal files_written
        files_written += 1
        target_file_path = os.path.join(target_dir, f"combined_{files_written}.txt")
        with open(target_file_path, "w", encoding="utf-8") as target_file:
            target_file.write(separator.join(documents))

    for file_path in tqdm(file_paths):
        try:
            with open(file_path, "r", encoding="utf-8") as file:
                content = file.read()
        except UnicodeDecodeError:
            # Attempt to read the file with a fallback encoding
            tqdm.write(f"Warning: UnicodeDecodeError encountered. Trying fallback encoding for {file_path}")
            with open(file_path, "r", encoding=fallback_encoding) as file:
                content = file.read()

        if not is_english(content):
            tqdm.write(f"Skipping {file_path} as it does not contain primarily English text.")
            continue
        content = strip_headers(content)

        # Regular expression to replace multiple blank lines with a single blank line
        content = re.sub(r'\n\s*\n', '\n\n', content)
        estimated_size = len(content.encode("utf-8"))

        # Flush only when something is buffered; otherwise an oversized first
        # document would produce an empty output file.
        if current_content and current_size + estimated_size > max_size_bytes:
            write_chunk(current_content)
            current_content = []
            current_size = 0

        current_content.append(content)
        current_size += estimated_size

    if current_content:
        write_chunk(current_content)

    return files_written
|
2024-03-13 08:34:39 -05:00
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Command-line entry point: collect the raw text files below --data_dir
    # and merge them into size-capped files inside --output_dir.
    cli = argparse.ArgumentParser(description="Preprocess and combine text files for pretraining")
    cli.add_argument(
        "--data_dir",
        type=str,
        default="gutenberg/data/raw",
        help="Directory containing the downloaded raw training data",
    )
    cli.add_argument(
        "--max_size_mb",
        type=int,
        default=500,
        help="The maximum file size for each concatenated file in megabytes",
    )
    cli.add_argument(
        "--output_dir",
        type=str,
        default="gutenberg_preprocessed",
        help="Directory where the preprocessed data will be saved",
    )
    args = cli.parse_args()

    # Walk the data directory recursively and keep only the text files.
    text_suffixes = (".txt", ".txt.utf8")
    all_files = []
    for path, _subdirs, files in os.walk(args.data_dir):
        for name in files:
            if name.endswith(text_suffixes):
                all_files.append(os.path.join(path, name))

    print(f"{len(all_files)} file(s) to process.")

    file_counter = combine_files(all_files, args.output_dir, max_size_mb=args.max_size_mb)
    output_location = os.path.abspath(args.output_dir)
    print(f"{file_counter} file(s) saved in {output_location}")
|