# LLMs-from-scratch/ch05/03_bonus_pretraining_on_gutenberg/prepare_dataset.py

# Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt).
# Source for "Build a Large Language Model From Scratch"
#   - https://www.manning.com/books/build-a-large-language-model-from-scratch
# Code: https://github.com/rasbt/LLMs-from-scratch

"""
Script that processes the Project Gutenberg files into fewer larger files.
"""

import argparse
import os
import re
from tqdm import tqdm
from gutenberg.src.cleanup import strip_headers


def is_english(text, threshold=0.9):
    """Heuristically decide whether *text* is primarily English.

    The heuristic is an ASCII-ratio test: the fraction of characters with a
    code point below 128 must be strictly greater than *threshold*.

    Args:
        text: The string to inspect.
        threshold: Fraction of ASCII characters that must be exceeded.

    Returns:
        True if the ASCII fraction is greater than *threshold*; False
        otherwise, including for empty text.
    """
    if not text:
        # An empty document has no characters to judge, and dividing by
        # len(text) would raise ZeroDivisionError. Report it as non-English
        # so callers skip it.
        return False
    ascii_chars = sum(1 for c in text if ord(c) < 128)
    return ascii_chars / len(text) > threshold


def _write_combined_file(target_dir, file_index, contents, separator):
    """Write *contents* joined by *separator* to ``combined_<file_index>.txt`` in *target_dir*."""
    target_file_path = os.path.join(target_dir, f"combined_{file_index}.txt")
    with open(target_file_path, "w", encoding="utf-8") as target_file:
        target_file.write(separator.join(contents))


def combine_files(file_paths, target_dir, max_size_mb=500, separator="<|endoftext|>", fallback_encoding="latin1"):
    """Concatenate many text files into fewer, larger ``combined_<n>.txt`` files.

    Each input file is read as UTF-8, falling back to *fallback_encoding* on a
    decode error. Empty files and files rejected by ``is_english`` are skipped.
    Each remaining document is passed through ``strip_headers`` (presumably
    removes the Project Gutenberg boilerplate; see the gutenberg package), and
    has runs of blank lines collapsed. Documents are then grouped so that the
    summed UTF-8 size of each group stays within *max_size_mb*. Separator bytes
    are not counted, so the limit is approximate. A single document larger than
    the limit is written to a file of its own.

    Args:
        file_paths: Iterable of paths to the input text files.
        target_dir: Directory for the combined files; created if missing.
        max_size_mb: Approximate maximum size of each combined file, in MiB.
        separator: String inserted between documents within a combined file.
        fallback_encoding: Encoding tried when UTF-8 decoding fails.

    Returns:
        The number of combined files written (0 if no document was accepted).
    """
    os.makedirs(target_dir, exist_ok=True)

    max_size_bytes = max_size_mb * 1024 * 1024
    current_content = []
    current_size = 0
    files_written = 0

    for file_path in tqdm(file_paths):
        try:
            with open(file_path, "r", encoding="utf-8") as file:
                content = file.read()
        except UnicodeDecodeError:
            # Attempt to read the file with a fallback encoding
            tqdm.write(f"Warning: UnicodeDecodeError encountered. Trying fallback encoding for {file_path}")
            with open(file_path, "r", encoding=fallback_encoding) as file:
                content = file.read()

        if not content:
            # Nothing to add; also avoids a division by zero in the
            # ASCII-ratio language check.
            tqdm.write(f"Skipping {file_path} as it is empty.")
            continue
        if not is_english(content):
            tqdm.write(f"Skipping {file_path} as it does not contain primarily English text.")
            continue
        content = strip_headers(content)

        # Regular expression to replace multiple blank lines with a single blank line
        content = re.sub(r'\n\s*\n', '\n\n', content)
        estimated_size = len(content.encode("utf-8"))

        # Flush the pending batch before it would exceed the limit. The
        # `current_content` guard prevents writing an empty file when the very
        # first document is already larger than the limit.
        if current_content and current_size + estimated_size > max_size_bytes:
            files_written += 1
            _write_combined_file(target_dir, files_written, current_content, separator)
            current_content = []
            current_size = 0

        current_content.append(content)
        current_size += estimated_size

    # Write whatever is left over after the last input file.
    if current_content:
        files_written += 1
        _write_combined_file(target_dir, files_written, current_content, separator)
    return files_written


if __name__ == "__main__":

    # Command-line interface: input directory, size limit per combined file,
    # and output directory.
    parser = argparse.ArgumentParser(description="Preprocess and combine text files for pretraining")
    parser.add_argument(
        "--data_dir",
        type=str,
        default="gutenberg/data/raw",
        help="Directory containing the downloaded raw training data",
    )
    parser.add_argument(
        "--max_size_mb",
        type=int,
        default=500,
        help="The maximum file size for each concatenated file in megabytes",
    )
    parser.add_argument(
        "--output_dir",
        type=str,
        default="gutenberg_preprocessed",
        help="Directory where the preprocessed data will be saved",
    )
    args = parser.parse_args()

    # Walk the data directory recursively and keep every file whose name ends
    # with one of the accepted text suffixes.
    text_suffixes = (".txt", ".txt.utf8")
    all_files = []
    for path, _subdirs, files in os.walk(args.data_dir):
        all_files.extend(os.path.join(path, name) for name in files if name.endswith(text_suffixes))

    print(f"{len(all_files)} file(s) to process.")
    file_counter = combine_files(all_files, args.output_dir, max_size_mb=args.max_size_mb)
    print(f"{file_counter} file(s) saved in {os.path.abspath(args.output_dir)}")
Ch05 supplementary code (#81) 2024-03-19 09:26:26 -05:00			`# Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt).`
			`# Source for "Build a Large Language Model From Scratch"`
			`# - https://www.manning.com/books/build-a-large-language-model-from-scratch`
			`# Code: https://github.com/rasbt/LLMs-from-scratch`

pretraining on project gutenberg 2024-03-13 08:34:39 -05:00			`"""`
			`Script that processes the Project Gutenberg files into fewer larger files.`
			`"""`

			`import argparse`
			`import os`
improve instructions 2024-04-02 07:12:22 -05:00			`import re`
remove all non-English texts and notice (#304) * remove all non-English texts and notice 1. almost 18GB txt left after `is_english` filtered. 2. remove notice use gutenberg's strip_headers 3. after re-run get_data.py, seems all data are under `gutenberg/data/.mirror` folder. * some improvements * update readme --------- Co-authored-by: rasbt <mail@sebastianraschka.com> 2024-08-10 06:09:14 +08:00			`from tqdm import tqdm`
			`from gutenberg.src.cleanup import strip_headers`


			`def is_english(text, threshold=0.9):`
			`ascii_chars = sum(1 for c in text if ord(c) < 128)`
			`return ascii_chars / len(text) > threshold`
pretraining on project gutenberg 2024-03-13 08:34:39 -05:00

			`def combine_files(file_paths, target_dir, max_size_mb=500, separator="<\|endoftext\|>", fallback_encoding="latin1"):`
			`if not os.path.exists(target_dir):`
			`os.makedirs(target_dir)`

			`current_content = []`
			`current_size = 0`
			`file_counter = 1`

remove all non-English texts and notice (#304) * remove all non-English texts and notice 1. almost 18GB txt left after `is_english` filtered. 2. remove notice use gutenberg's strip_headers 3. after re-run get_data.py, seems all data are under `gutenberg/data/.mirror` folder. * some improvements * update readme --------- Co-authored-by: rasbt <mail@sebastianraschka.com> 2024-08-10 06:09:14 +08:00			`for file_path in tqdm(file_paths):`
pretraining on project gutenberg 2024-03-13 08:34:39 -05:00			`try:`
			`with open(file_path, "r", encoding="utf-8") as file:`
			`content = file.read()`
			`except UnicodeDecodeError:`
			`# Attempt to read the file with a fallback encoding`
remove all non-English texts and notice (#304) * remove all non-English texts and notice 1. almost 18GB txt left after `is_english` filtered. 2. remove notice use gutenberg's strip_headers 3. after re-run get_data.py, seems all data are under `gutenberg/data/.mirror` folder. * some improvements * update readme --------- Co-authored-by: rasbt <mail@sebastianraschka.com> 2024-08-10 06:09:14 +08:00			`tqdm.write(f"Warning: UnicodeDecodeError encountered. Trying fallback encoding for {file_path}")`
pretraining on project gutenberg 2024-03-13 08:34:39 -05:00			`with open(file_path, "r", encoding=fallback_encoding) as file:`
			`content = file.read()`

remove all non-English texts and notice (#304) * remove all non-English texts and notice 1. almost 18GB txt left after `is_english` filtered. 2. remove notice use gutenberg's strip_headers 3. after re-run get_data.py, seems all data are under `gutenberg/data/.mirror` folder. * some improvements * update readme --------- Co-authored-by: rasbt <mail@sebastianraschka.com> 2024-08-10 06:09:14 +08:00			`if not is_english(content):`
			`tqdm.write(f"Skipping {file_path} as it does not contain primarily English text.")`
			`continue`
			`content = strip_headers(content)`

improve instructions 2024-04-02 07:12:22 -05:00			`# Regular expression to replace multiple blank lines with a single blank line`
			`content = re.sub(r'\n\s*\n', '\n\n', content)`
pretraining on project gutenberg 2024-03-13 08:34:39 -05:00			`estimated_size = len(content.encode("utf-8"))`

			`if current_size + estimated_size > max_size_mb * 1024 * 1024:`
			`target_file_path = os.path.join(target_dir, f"combined_{file_counter}.txt")`
			`with open(target_file_path, "w", encoding="utf-8") as target_file:`
			`target_file.write(separator.join(current_content))`
			`file_counter += 1`
			`current_content = [content]`
			`current_size = estimated_size`
			`else:`
			`current_content.append(content)`
			`current_size += estimated_size`

			`if current_content:`
			`target_file_path = os.path.join(target_dir, f"combined_{file_counter}.txt")`
			`with open(target_file_path, "w", encoding="utf-8") as target_file:`
			`target_file.write(separator.join(current_content))`
improve instructions 2024-04-02 07:12:22 -05:00			`return file_counter`
pretraining on project gutenberg 2024-03-13 08:34:39 -05:00

			`if __name__ == "__main__":`

improve instructions 2024-04-02 07:12:22 -05:00			`parser = argparse.ArgumentParser(description="Preprocess and combine text files for pretraining")`
pretraining on project gutenberg 2024-03-13 08:34:39 -05:00
remove all non-English texts and notice (#304) * remove all non-English texts and notice 1. almost 18GB txt left after `is_english` filtered. 2. remove notice use gutenberg's strip_headers 3. after re-run get_data.py, seems all data are under `gutenberg/data/.mirror` folder. * some improvements * update readme --------- Co-authored-by: rasbt <mail@sebastianraschka.com> 2024-08-10 06:09:14 +08:00			`parser.add_argument("--data_dir", type=str, default="gutenberg/data/raw",`
pretraining on project gutenberg 2024-03-13 08:34:39 -05:00			`help="Directory containing the downloaded raw training data")`
			`parser.add_argument("--max_size_mb", type=int, default=500,`
			`help="The maximum file size for each concatenated file in megabytes")`
			`parser.add_argument("--output_dir", type=str, default="gutenberg_preprocessed",`
			`help="Directory where the preprocessed data will be saved")`

			`args = parser.parse_args()`

			`all_files = [os.path.join(path, name) for path, subdirs, files in os.walk(args.data_dir)`
remove all non-English texts and notice (#304) * remove all non-English texts and notice 1. almost 18GB txt left after `is_english` filtered. 2. remove notice use gutenberg's strip_headers 3. after re-run get_data.py, seems all data are under `gutenberg/data/.mirror` folder. * some improvements * update readme --------- Co-authored-by: rasbt <mail@sebastianraschka.com> 2024-08-10 06:09:14 +08:00			`for name in files if name.endswith((".txt", ".txt.utf8"))]`
pretraining on project gutenberg 2024-03-13 08:34:39 -05:00
improve instructions 2024-04-02 07:12:22 -05:00			`print(f"{len(all_files)} file(s) to process.")`
Use max size properly 2024-04-02 13:29:23 -05:00			`file_counter = combine_files(all_files, args.output_dir, max_size_mb=args.max_size_mb)`
improve instructions 2024-04-02 07:12:22 -05:00			`print(f"{file_counter} file(s) saved in {os.path.abspath(args.output_dir)}")`