diff --git a/ch05/03_bonus_pretraining_on_gutenberg/README.md b/ch05/03_bonus_pretraining_on_gutenberg/README.md
index bcbc16e..2f331ba 100644
--- a/ch05/03_bonus_pretraining_on_gutenberg/README.md
+++ b/ch05/03_bonus_pretraining_on_gutenberg/README.md
@@ -82,11 +82,18 @@ Next, run the `prepare_dataset.py` script, which concatenates the (as of this wr
 
 ```bash
 python prepare_dataset.py \
-  --data_dir gutenberg/data \
+  --data_dir gutenberg/data/raw \
   --max_size_mb 500 \
   --output_dir gutenberg_preprocessed
 ```
 
+```
+...
+Skipping gutenberg/data/raw/PG29836_raw.txt as it does not contain primarily English text.
+Skipping gutenberg/data/raw/PG16527_raw.txt as it does not contain primarily English text.
+100%|██████████████████████████████████████████████████████████| 57250/57250 [25:04<00:00, 38.05it/s]
+42 file(s) saved in /Users/sebastian/Developer/LLMs-from-scratch/ch05/03_bonus_pretraining_on_gutenberg/gutenberg_preprocessed
+```
+
+
 > [!TIP]
 > Note that the produced files are stored in plaintext format and are not pre-tokenized for simplicity. However, you may want to update the codes to store the dataset in a pre-tokenized form to save computation time if you are planning to use the dataset more often or train for multiple epochs. See the *Design Decisions and Improvements* at the bottom of this page for more information.
 
diff --git a/ch05/03_bonus_pretraining_on_gutenberg/prepare_dataset.py b/ch05/03_bonus_pretraining_on_gutenberg/prepare_dataset.py
index a8fe5f8..4eafa3a 100644
--- a/ch05/03_bonus_pretraining_on_gutenberg/prepare_dataset.py
+++ b/ch05/03_bonus_pretraining_on_gutenberg/prepare_dataset.py
@@ -10,6 +10,13 @@ Script that processes the Project Gutenberg files into fewer larger files.
 import argparse
 import os
 import re
+from tqdm import tqdm
+from gutenberg.src.cleanup import strip_headers
+
+
+def is_english(text, threshold=0.9):
+    ascii_chars = sum(1 for c in text if ord(c) < 128)
+    return ascii_chars / len(text) > threshold
 
 
 def combine_files(file_paths, target_dir, max_size_mb=500, separator="<|endoftext|>", fallback_encoding="latin1"):
@@ -20,16 +27,21 @@ def combine_files(file_paths, target_dir, max_size_mb=500, separator="<|endoftex
     current_size = 0
     file_counter = 1
 
-    for file_path in file_paths:
+    for file_path in tqdm(file_paths):
         try:
             with open(file_path, "r", encoding="utf-8") as file:
                 content = file.read()
         except UnicodeDecodeError:
             # Attempt to read the file with a fallback encoding
-            print(f"Warning: UnicodeDecodeError encountered. Trying fallback encoding for {file_path}")
+            tqdm.write(f"Warning: UnicodeDecodeError encountered. Trying fallback encoding for {file_path}")
             with open(file_path, "r", encoding=fallback_encoding) as file:
                 content = file.read()
 
+        if not is_english(content):
+            tqdm.write(f"Skipping {file_path} as it does not contain primarily English text.")
+            continue
+        content = strip_headers(content)
+
         # Regular expression to replace multiple blank lines with a single blank line
         content = re.sub(r'\n\s*\n', '\n\n', content)
         estimated_size = len(content.encode("utf-8"))
@@ -56,7 +68,7 @@ if __name__ == "__main__":
 
     parser = argparse.ArgumentParser(description="Preprocess and combine text files for pretraining")
 
-    parser.add_argument("--data_dir", type=str, default="gutenberg/data",
+    parser.add_argument("--data_dir", type=str, default="gutenberg/data/raw",
                         help="Directory containing the downloaded raw training data")
     parser.add_argument("--max_size_mb", type=int, default=500,
                         help="The maximum file size for each concatenated file in megabytes")
@@ -66,7 +78,7 @@ if __name__ == "__main__":
     args = parser.parse_args()
 
     all_files = [os.path.join(path, name) for path, subdirs, files in os.walk(args.data_dir)
-                 for name in files if name.endswith((".txt", ".txt.utf8")) and "raw" not in path]
+                 for name in files if name.endswith((".txt", ".txt.utf8"))]
 
     print(f"{len(all_files)} file(s) to process.")
     file_counter = combine_files(all_files, args.output_dir, max_size_mb=args.max_size_mb)
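A note on the changes above: the new `is_english` helper treats a file as English when more than `threshold` (default 90%) of its characters are ASCII, and `strip_headers` (imported from the locally cloned `gutenberg` code) removes the Project Gutenberg boilerplate before the books are concatenated.

The README tip quoted in the first hunk also suggests storing the concatenated files in pre-tokenized form when training for multiple epochs. Below is a minimal sketch of that idea, assuming the GPT-2 BPE tokenizer from `tiktoken` as used in the main chapters; the `gutenberg_pretokenized` directory name and the `.npy` output format are illustrative choices, not part of the repository:

```python
# Hypothetical pre-tokenization sketch (not part of the repository's scripts):
# converts each concatenated text file into a NumPy array of GPT-2 token IDs.
import os

import numpy as np
import tiktoken

tokenizer = tiktoken.get_encoding("gpt2")

input_dir = "gutenberg_preprocessed"   # output directory of prepare_dataset.py
output_dir = "gutenberg_pretokenized"  # illustrative target directory
os.makedirs(output_dir, exist_ok=True)

for file_name in sorted(os.listdir(input_dir)):
    if not file_name.endswith(".txt"):
        continue
    with open(os.path.join(input_dir, file_name), "r", encoding="utf-8") as f:
        text = f.read()

    # Allow the "<|endoftext|>" separator that combine_files() inserts between books
    token_ids = tokenizer.encode(text, allowed_special={"<|endoftext|>"})

    # GPT-2's vocabulary (50,257 tokens) fits into unsigned 16-bit integers
    out_path = os.path.join(output_dir, file_name.replace(".txt", ".npy"))
    np.save(out_path, np.array(token_ids, dtype=np.uint16))
    print(f"Saved {len(token_ids)} token IDs to {out_path}")
```

The saved arrays could later be loaded with `np.load(..., mmap_mode="r")`, so repeated training runs or multiple epochs skip re-tokenizing the raw text.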