Mirror of https://github.com/rasbt/LLMs-from-scratch.git (synced 2025-08-19 06:02:38 +00:00)
remove all non-English texts and notice (#304)
* Remove all non-English texts and the Project Gutenberg notices:
  1. Almost 18 GB of text remains after filtering with `is_english`.
  2. The notices are removed using gutenberg's `strip_headers`.
  3. After re-running get_data.py, all data appears to end up under the `gutenberg/data/.mirror` folder.
* Some improvements
* Update README

Co-authored-by: rasbt <mail@sebastianraschka.com>
parent f1c3d451fe
commit 09a3a73f2d
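For context, `strip_headers` (imported in prepare_dataset.py from the cloned gutenberg repository as `gutenberg.src.cleanup.strip_headers`) strips the Project Gutenberg license header and footer from a book's raw text. Below is a minimal sketch of how it can be used on a single file; the file name is hypothetical and the snippet is not part of the repository:

```python
# Minimal sketch (not part of the repo): strip the Project Gutenberg boilerplate
# from one raw text file, assuming the gutenberg repository is cloned locally so
# that gutenberg/src/cleanup.py is importable, as prepare_dataset.py does.
from gutenberg.src.cleanup import strip_headers

with open("gutenberg/data/raw/PG2701_raw.txt", "r", encoding="utf-8") as f:  # hypothetical file name
    raw = f.read()

book_text = strip_headers(raw)  # removes the "*** START/END OF THE PROJECT GUTENBERG EBOOK ***" sections
print(book_text[:200])
```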
ch05/03_bonus_pretraining_on_gutenberg/README.md

@@ -82,11 +82,18 @@ Next, run the `prepare_dataset.py` script, which concatenates the (as of this wr
 
 ```bash
 python prepare_dataset.py \
-  --data_dir gutenberg/data \
+  --data_dir gutenberg/data/raw \
   --max_size_mb 500 \
   --output_dir gutenberg_preprocessed
 ```
 
+```
+...
+Skipping gutenberg/data/raw/PG29836_raw.txt as it does not contain primarily English text. Skipping gutenberg/data/raw/PG16527_raw.txt as it does not contain primarily English text. 100%|██████████████████████████████████████████████████████████| 57250/57250 [25:04<00:00, 38.05it/s]
+42 file(s) saved in /Users/sebastian/Developer/LLMs-from-scratch/ch05/03_bonus_pretraining_on_gutenberg/gutenberg_preprocessed
+```
+
+
 > [!TIP]
 > Note that the produced files are stored in plaintext format and are not pre-tokenized for simplicity. However, you may want to update the codes to store the dataset in a pre-tokenized form to save computation time if you are planning to use the dataset more often or train for multiple epochs. See the *Design Decisions and Improvements* at the bottom of this page for more information.
 
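As a concrete illustration of the tip quoted above, the sketch below (not part of the repository) pre-tokenizes one preprocessed file with the GPT-2 BPE tokenizer from `tiktoken` and saves the token IDs to disk; the file names and the `uint16` dtype are assumptions:

```python
# Minimal sketch: tokenize a preprocessed file once and reuse the saved IDs later.
import numpy as np
import tiktoken

enc = tiktoken.get_encoding("gpt2")

with open("gutenberg_preprocessed/combined_1.txt", "r", encoding="utf-8") as f:  # hypothetical file name
    text = f.read()

# Keep the document separator as one special token instead of splitting it into sub-tokens.
token_ids = enc.encode(text, allowed_special={"<|endoftext|>"})

# GPT-2's vocabulary (50,257 tokens) fits into uint16, which halves the storage vs. int32.
np.save("gutenberg_preprocessed/combined_1_tokens.npy", np.array(token_ids, dtype=np.uint16))
```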
ch05/03_bonus_pretraining_on_gutenberg/prepare_dataset.py

@@ -10,6 +10,13 @@ Script that processes the Project Gutenberg files into fewer larger files.
 import argparse
 import os
 import re
+from tqdm import tqdm
+from gutenberg.src.cleanup import strip_headers
+
+
+def is_english(text, threshold=0.9):
+    ascii_chars = sum(1 for c in text if ord(c) < 128)
+    return ascii_chars / len(text) > threshold
 
 
 def combine_files(file_paths, target_dir, max_size_mb=500, separator="<|endoftext|>", fallback_encoding="latin1"):
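The new `is_english` check is a simple ASCII-ratio heuristic rather than true language detection: a text passes as long as more than 90% of its characters are plain ASCII (with the default `threshold=0.9`). A small standalone demo with made-up sample strings:

```python
# Same heuristic as the function added above, repeated here so the snippet runs on its own.
def is_english(text, threshold=0.9):
    ascii_chars = sum(1 for c in text if ord(c) < 128)
    return ascii_chars / len(text) > threshold

print(is_english("Call me Ishmael. Some years ago, never mind how long precisely..."))  # True: all ASCII
print(is_english("Les misérables, par Victor Hugo"))  # True: only one accented character, so it slips through
print(is_english("Война и мир, Лев Толстой"))         # False: mostly Cyrillic, far below the 0.9 threshold
```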
@@ -20,16 +27,21 @@ def combine_files(file_paths, target_dir, max_size_mb=500, separator="<|endoftex
     current_size = 0
     file_counter = 1
 
-    for file_path in file_paths:
+    for file_path in tqdm(file_paths):
         try:
             with open(file_path, "r", encoding="utf-8") as file:
                 content = file.read()
         except UnicodeDecodeError:
             # Attempt to read the file with a fallback encoding
-            print(f"Warning: UnicodeDecodeError encountered. Trying fallback encoding for {file_path}")
+            tqdm.write(f"Warning: UnicodeDecodeError encountered. Trying fallback encoding for {file_path}")
             with open(file_path, "r", encoding=fallback_encoding) as file:
                 content = file.read()
 
+        if not is_english(content):
+            tqdm.write(f"Skipping {file_path} as it does not contain primarily English text.")
+            continue
+        content = strip_headers(content)
+
         # Regular expression to replace multiple blank lines with a single blank line
         content = re.sub(r'\n\s*\n', '\n\n', content)
         estimated_size = len(content.encode("utf-8"))
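The `re.sub(r'\n\s*\n', '\n\n', content)` step above collapses runs of blank lines (including lines that contain only whitespace) into a single blank line; a quick standalone example:

```python
import re

sample = "Chapter I\n\n\n\nIt was a dark night.\n   \nThe wind howled."
cleaned = re.sub(r'\n\s*\n', '\n\n', sample)
print(repr(cleaned))  # 'Chapter I\n\nIt was a dark night.\n\nThe wind howled.'
```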
@@ -56,7 +68,7 @@ if __name__ == "__main__":
 
     parser = argparse.ArgumentParser(description="Preprocess and combine text files for pretraining")
 
-    parser.add_argument("--data_dir", type=str, default="gutenberg/data",
+    parser.add_argument("--data_dir", type=str, default="gutenberg/data/raw",
                         help="Directory containing the downloaded raw training data")
     parser.add_argument("--max_size_mb", type=int, default=500,
                         help="The maximum file size for each concatenated file in megabytes")
@@ -66,7 +78,7 @@ if __name__ == "__main__":
     args = parser.parse_args()
 
     all_files = [os.path.join(path, name) for path, subdirs, files in os.walk(args.data_dir)
-                 for name in files if name.endswith((".txt", ".txt.utf8")) and "raw" not in path]
+                 for name in files if name.endswith((".txt", ".txt.utf8"))]
 
     print(f"{len(all_files)} file(s) to process.")
     file_counter = combine_files(all_files, args.output_dir, max_size_mb=args.max_size_mb)