remove all non-English texts and notices (#304)

* remove all non-English texts and notices

1. Almost 18 GB of text remains after filtering with `is_english`.
2. Remove the Project Gutenberg notices using gutenberg's `strip_headers`.
3. After re-running get_data.py, all data appears to end up under the `gutenberg/data/.mirror` folder.

* some improvements

* update readme

---------

Co-authored-by: rasbt <mail@sebastianraschka.com>
TITC authored 2024-08-10 06:09:14 +08:00 · committed by GitHub
parent f1c3d451fe
commit 09a3a73f2d
2 changed files with 24 additions and 5 deletions
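
The `is_english` filter mentioned in the commit message is a plain ASCII-ratio heuristic rather than real language detection. A minimal standalone sketch of the idea, using the same 0.9 threshold as the function added below (the sample strings are made up for illustration):

```python
# Same ASCII-ratio heuristic as the is_english function added in prepare_dataset.py below.
def is_english(text, threshold=0.9):
    ascii_chars = sum(1 for c in text if ord(c) < 128)
    return ascii_chars / len(text) > threshold


print(is_english("This is an ordinary English sentence."))  # True (all ASCII)
print(is_english("これは日本語の文章です。"))                    # False (mostly non-ASCII)
# Note: heavily accented but otherwise English text can also fall below the
# threshold, so the filter is approximate by design.
```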

ch05/03_bonus_pretraining_on_gutenberg/README.md

@@ -82,11 +82,18 @@ Next, run the `prepare_dataset.py` script, which concatenates the (as of this wr
 ```bash
 python prepare_dataset.py \
-  --data_dir gutenberg/data \
+  --data_dir gutenberg/data/raw \
   --max_size_mb 500 \
   --output_dir gutenberg_preprocessed
 ```
 
+```
+...
+Skipping gutenberg/data/raw/PG29836_raw.txt as it does not contain primarily English text.
+Skipping gutenberg/data/raw/PG16527_raw.txt as it does not contain primarily English text.
+100%|██████████████████████████████████████████████████████████| 57250/57250 [25:04<00:00, 38.05it/s]
+42 file(s) saved in /Users/sebastian/Developer/LLMs-from-scratch/ch05/03_bonus_pretraining_on_gutenberg/gutenberg_preprocessed
+```
+
 > [!TIP]
 > Note that the produced files are stored in plaintext format and are not pre-tokenized for simplicity. However, you may want to update the codes to store the dataset in a pre-tokenized form to save computation time if you are planning to use the dataset more often or train for multiple epochs. See the *Design Decisions and Improvements* at the bottom of this page for more information.
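
The tip above leaves the prepared files as plaintext. If you expect to train for multiple epochs, one option (not part of this commit) is to tokenize each prepared file once and cache the token IDs; a rough sketch using the GPT-2 BPE tokenizer via tiktoken, where the file names are hypothetical:

```python
# Sketch only: pre-tokenize one prepared file and cache the token IDs as a NumPy array.
import numpy as np
import tiktoken

tokenizer = tiktoken.get_encoding("gpt2")

input_path = "gutenberg_preprocessed/combined_1.txt"    # hypothetical name of a prepared file
output_path = "gutenberg_preprocessed/combined_1.npy"

with open(input_path, "r", encoding="utf-8") as f:
    text = f.read()

# The prepared files join books with an <|endoftext|> separator, so allow that special token.
token_ids = tokenizer.encode(text, allowed_special={"<|endoftext|>"})

# GPT-2 token IDs are below 50257, so uint16 is enough and halves the storage.
np.save(output_path, np.array(token_ids, dtype=np.uint16))
```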

ch05/03_bonus_pretraining_on_gutenberg/prepare_dataset.py

@@ -10,6 +10,13 @@ Script that processes the Project Gutenberg files into fewer larger files.
 import argparse
 import os
 import re
+from tqdm import tqdm
+from gutenberg.src.cleanup import strip_headers
+
+
+def is_english(text, threshold=0.9):
+    ascii_chars = sum(1 for c in text if ord(c) < 128)
+    return ascii_chars / len(text) > threshold
 
 
 def combine_files(file_paths, target_dir, max_size_mb=500, separator="<|endoftext|>", fallback_encoding="latin1"):
@@ -20,16 +27,21 @@ def combine_files(file_paths, target_dir, max_size_mb=500, separator="<|endoftex
     current_size = 0
     file_counter = 1
 
-    for file_path in file_paths:
+    for file_path in tqdm(file_paths):
         try:
             with open(file_path, "r", encoding="utf-8") as file:
                 content = file.read()
         except UnicodeDecodeError:
             # Attempt to read the file with a fallback encoding
-            print(f"Warning: UnicodeDecodeError encountered. Trying fallback encoding for {file_path}")
+            tqdm.write(f"Warning: UnicodeDecodeError encountered. Trying fallback encoding for {file_path}")
             with open(file_path, "r", encoding=fallback_encoding) as file:
                 content = file.read()
 
+        if not is_english(content):
+            tqdm.write(f"Skipping {file_path} as it does not contain primarily English text.")
+            continue
+        content = strip_headers(content)
+
         # Regular expression to replace multiple blank lines with a single blank line
         content = re.sub(r'\n\s*\n', '\n\n', content)
         estimated_size = len(content.encode("utf-8"))
@@ -56,7 +68,7 @@ if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="Preprocess and combine text files for pretraining")
-    parser.add_argument("--data_dir", type=str, default="gutenberg/data",
+    parser.add_argument("--data_dir", type=str, default="gutenberg/data/raw",
                         help="Directory containing the downloaded raw training data")
     parser.add_argument("--max_size_mb", type=int, default=500,
                         help="The maximum file size for each concatenated file in megabytes")
@@ -66,7 +78,7 @@ if __name__ == "__main__":
     args = parser.parse_args()
 
     all_files = [os.path.join(path, name) for path, subdirs, files in os.walk(args.data_dir)
-                 for name in files if name.endswith((".txt", ".txt.utf8")) and "raw" not in path]
+                 for name in files if name.endswith((".txt", ".txt.utf8"))]
     print(f"{len(all_files)} file(s) to process.")
     file_counter = combine_files(all_files, args.output_dir, max_size_mb=args.max_size_mb)
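
As a quick follow-up after re-running the pipeline, it can be worth confirming that the combined output really is free of the Project Gutenberg boilerplate that `strip_headers` is supposed to remove. A rough, illustrative check, assuming the default `gutenberg_preprocessed` output directory from the README command above (the marker strings are the usual Project Gutenberg header/footer phrases, not something this repo defines):

```python
# Rough sanity check: scan the prepared files for leftover Project Gutenberg boilerplate markers.
import glob

markers = ("*** START OF", "*** END OF", "PROJECT GUTENBERG EBOOK")
for path in sorted(glob.glob("gutenberg_preprocessed/*.txt")):
    with open(path, "r", encoding="utf-8") as f:
        text = f.read()
    hits = {marker: text.count(marker) for marker in markers if marker in text}
    print(path, hits if hits else "no obvious boilerplate markers found")
```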