From f30dd2dd2be76fa2d52fc84e7b9abadc073858cf Mon Sep 17 00:00:00 2001
From: rasbt
Date: Tue, 2 Apr 2024 07:12:22 -0500
Subject: [PATCH] improve instructions

---
 ch05/03_bonus_pretraining_on_gutenberg/README.md |  8 ++++----
 .../prepare_dataset.py                           | 13 ++++++++-----
 2 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/ch05/03_bonus_pretraining_on_gutenberg/README.md b/ch05/03_bonus_pretraining_on_gutenberg/README.md
index b5cd141..2ea0f3b 100644
--- a/ch05/03_bonus_pretraining_on_gutenberg/README.md
+++ b/ch05/03_bonus_pretraining_on_gutenberg/README.md
@@ -34,10 +34,10 @@ Follow these steps to download the dataset:
 Next, run the `prepare_dataset.py` script, which concatenates the (as of this writing, 60,173) text files into fewer larger files so that they can be more efficiently transferred and accessed:
 
 ```
-prepare_dataset.py \
-  --data_dir "gutenberg/data" \
+python prepare_dataset.py \
+  --data_dir gutenberg/data \
   --max_size_mb 500 \
-  --output_dir "gutenberg_preprocessed"
+  --output_dir gutenberg_preprocessed
 ```
 
 > [!TIP]
@@ -53,7 +53,7 @@
 You can run the pretraining script as follows. Note that the additional command line arguments are shown with the default values for illustration purposes:
 
 ```bash
-pretraining_simple.py \
+python pretraining_simple.py \
   --data_dir "gutenberg_preprocessed" \
   --n_epochs 1 \
   --batch_size 4 \
diff --git a/ch05/03_bonus_pretraining_on_gutenberg/prepare_dataset.py b/ch05/03_bonus_pretraining_on_gutenberg/prepare_dataset.py
index 7ab5df4..df66ecf 100644
--- a/ch05/03_bonus_pretraining_on_gutenberg/prepare_dataset.py
+++ b/ch05/03_bonus_pretraining_on_gutenberg/prepare_dataset.py
@@ -9,6 +9,7 @@ Script that processes the Project Gutenberg files into fewer larger files.
 
 import argparse
 import os
+import re
 
 
 def combine_files(file_paths, target_dir, max_size_mb=500, separator="<|endoftext|>", fallback_encoding="latin1"):
@@ -29,6 +30,8 @@ def combine_files(file_paths, target_dir, max_size_mb=500, separator="<|endoftex
             with open(file_path, "r", encoding=fallback_encoding) as file:
                 content = file.read()
 
+        # Regular expression to replace multiple blank lines with a single blank line
+        content = re.sub(r'\n\s*\n', '\n\n', content)
         estimated_size = len(content.encode("utf-8"))
 
         if current_size + estimated_size > max_size_mb * 1024 * 1024:
@@ -46,11 +49,12 @@ def combine_files(file_paths, target_dir, max_size_mb=500, separator="<|endoftex
     target_file_path = os.path.join(target_dir, f"combined_{file_counter}.txt")
     with open(target_file_path, "w", encoding="utf-8") as target_file:
         target_file.write(separator.join(current_content))
+    return file_counter
 
 
 if __name__ == "__main__":
 
-    parser = argparse.ArgumentParser(description="GPT Model Training Configuration")
+    parser = argparse.ArgumentParser(description="Preprocess and combine text files for pretraining")
 
     parser.add_argument("--data_dir", type=str, default="gutenberg/data",
                         help="Directory containing the downloaded raw training data")
@@ -64,7 +68,6 @@ if __name__ == "__main__":
 
     all_files = [os.path.join(path, name) for path, subdirs, files in os.walk(args.data_dir)
                  for name in files if name.endswith((".txt", ".txt.utf8")) and "raw" not in path]
-    target_dir = "path_to_your_large_files"
-    print(f"{len(all_files)} files to process.")
-
-    combine_files(all_files, args.output_dir)
+    print(f"{len(all_files)} file(s) to process.")
+    file_counter = combine_files(all_files, args.output_dir)
+    print(f"{file_counter} file(s) saved in {os.path.abspath(args.output_dir)}")
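
Note (not part of the patch): the sketch below illustrates the blank-line normalization that the added `re.sub(r'\n\s*\n', '\n\n', content)` call performs on each file before the texts are joined with the script's default `<|endoftext|>` separator. The sample strings and variable names are hypothetical; only the regular expression and the separator come from the patch.

```python
import re

# Hypothetical stand-ins for two raw Project Gutenberg text files.
sample_texts = [
    "TITLE A\n\n\n\nCHAPTER I\nSome opening text.\n",
    "TITLE B\n\n\nCHAPTER I\nMore opening text.\n",
]

# Collapse runs of blank lines into a single blank line, as the patched
# combine_files() does before estimating each file's encoded size.
normalized = [re.sub(r'\n\s*\n', '\n\n', text) for text in sample_texts]

# Join the cleaned texts with the same default separator the script uses.
combined = "<|endoftext|>".join(normalized)
print(combined)
```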