remove all non-English texts and notice (#304)

* remove all non-English texts and notice 1. Almost 18 GB of text is left after filtering with `is_english`. 2. Notices are removed using gutenberg's `strip_headers`. 3. After re-running get_data.py, it seems all data is under the `gutenberg/data/.mirror` folder. * some improvements * update readme --------- Co-authored-by: rasbt <mail@sebastianraschka.com>
2025-11-30 17:11:36 +00:00 · 2024-08-10 06:09:14 +08:00 · 2024-08-10 06:09:14 +08:00 · 09a3a73f2d
commit 09a3a73f2d
parent f1c3d451fe
2 changed files with 24 additions and 5 deletions
--- a/ch05/03_bonus_pretraining_on_gutenberg/README.md
+++ b/ch05/03_bonus_pretraining_on_gutenberg/README.md
@ -82,11 +82,18 @@ Next, run the `prepare_dataset.py` script, which concatenates the (as of this wr

 ```bash
 python prepare_dataset.py \
-  --data_dir gutenberg/data \
+  --data_dir gutenberg/data/raw \
  --max_size_mb 500 \
  --output_dir gutenberg_preprocessed
 ```

+```
+...
+Skipping gutenberg/data/raw/PG29836_raw.txt as it does not contain primarily English text.                                     Skipping gutenberg/data/raw/PG16527_raw.txt as it does not contain primarily English text.                                     100%|██████████████████████████████████████████████████████████| 57250/57250 [25:04<00:00, 38.05it/s]
+42 file(s) saved in /Users/sebastian/Developer/LLMs-from-scratch/ch05/03_bonus_pretraining_on_gutenberg/gutenberg_preprocessed
+```
+
+
 > [!TIP] 
 > Note that the produced files are stored in plaintext format and are not pre-tokenized for simplicity. However, you may want to update the code to store the dataset in a pre-tokenized form to save computation time if you are planning to use the dataset more often or train for multiple epochs. See the *Design Decisions and Improvements* section at the bottom of this page for more information.

--- a/ch05/03_bonus_pretraining_on_gutenberg/prepare_dataset.py
+++ b/ch05/03_bonus_pretraining_on_gutenberg/prepare_dataset.py
@ -10,6 +10,13 @@ Script that processes the Project Gutenberg files into fewer larger files.
 import argparse
 import os
 import re
+from tqdm import tqdm
+from gutenberg.src.cleanup import strip_headers
+
+
+def is_english(text, threshold=0.9):  # True when the ASCII share of text exceeds threshold
+    ascii_chars = sum(1 for c in text if ord(c) < 128)  # code points below 128 count as ASCII
+    return bool(text) and ascii_chars / len(text) > threshold  # empty text -> False instead of ZeroDivisionError


 def combine_files(file_paths, target_dir, max_size_mb=500, separator="<|endoftext|>", fallback_encoding="latin1"):
@ -20,16 +27,21 @@ def combine_files(file_paths, target_dir, max_size_mb=500, separator="<|endoftex
    current_size = 0
    file_counter = 1

-    for file_path in file_paths:
+    for file_path in tqdm(file_paths):
        try:
            with open(file_path, "r", encoding="utf-8") as file:
                content = file.read()
        except UnicodeDecodeError:
            # Attempt to read the file with a fallback encoding
-            print(f"Warning: UnicodeDecodeError encountered. Trying fallback encoding for {file_path}")
+            tqdm.write(f"Warning: UnicodeDecodeError encountered. Trying fallback encoding for {file_path}")
            with open(file_path, "r", encoding=fallback_encoding) as file:
                content = file.read()

+        if not is_english(content):
+            tqdm.write(f"Skipping {file_path} as it does not contain primarily English text.")
+            continue
+        content = strip_headers(content)
+
        # Regular expression to replace multiple blank lines with a single blank line
        content = re.sub(r'\n\s*\n', '\n\n', content)
        estimated_size = len(content.encode("utf-8"))
@ -56,7 +68,7 @@ if __name__ == "__main__":

    parser = argparse.ArgumentParser(description="Preprocess and combine text files for pretraining")

-    parser.add_argument("--data_dir", type=str, default="gutenberg/data",
+    parser.add_argument("--data_dir", type=str, default="gutenberg/data/raw",
                        help="Directory containing the downloaded raw training data")
    parser.add_argument("--max_size_mb", type=int, default=500,
                        help="The maximum file size for each concatenated file in megabytes")
@ -66,7 +78,7 @@ if __name__ == "__main__":
    args = parser.parse_args()

    all_files = [os.path.join(path, name) for path, subdirs, files in os.walk(args.data_dir)
-                 for name in files if name.endswith((".txt", ".txt.utf8")) and "raw" not in path]
+                 for name in files if name.endswith((".txt", ".txt.utf8"))]

    print(f"{len(all_files)} file(s) to process.")
    file_counter = combine_files(all_files, args.output_dir, max_size_mb=args.max_size_mb)