From 8d84800bcff01217194f55d8dd778e39df996002 Mon Sep 17 00:00:00 2001
From: Sebastian Raschka <mail@sebastianraschka.com>
Date: Mon, 29 Apr 2024 21:56:05 -0500
Subject: [PATCH] use training set len (#137)

---
 .../02_bonus_additional-experiments/README.md | 34 +++++++++----------
 .../additional-experiments.py                 | 10 ++++--
 .../download-prepare-dataset.py               |  2 +-
 .../03_bonus_imdb-classification/train-gpt.py | 12 +++++--
 4 files changed, 35 insertions(+), 23 deletions(-)

diff --git a/ch06/02_bonus_additional-experiments/README.md b/ch06/02_bonus_additional-experiments/README.md
index 47a215e..59fa201 100644
--- a/ch06/02_bonus_additional-experiments/README.md
+++ b/ch06/02_bonus_additional-experiments/README.md
@@ -9,16 +9,16 @@ For example,
 
 &nbsp;
 
-|   | Model              | Weights    | Trainable token | Trainable layers | Context length          | CPU/GPU | Training time | Training acc | Validation acc | Test acc |
-|---|--------------------|------------|-----------------|------------------|-------------------------|---------|---------------|--------------|----------------|----------|
-| 1 | gpt2-small (124M)  | pretrained | last            | last_block       | longest train ex. (120) | V100    | 0.39 min      | 96.63%       | 97.99%         | 94.33%   |
-| 2 | gpt2-small (124M)  | pretrained | first           | last_block       | longest train ex. (120) | V100    | 0.37 min      | 78.46%       | 80.54%         | 75.00%   |
-| 3 | gpt2-small (124M)  | pretrained | last            | last_layer       | longest train ex. (120) | V100    | 0.33 min      | 78.65%       | 87.25%         | 78.33%   |
-| 4 | gpt2-small (124M)  | pretrained | last            | all              | longest train ex. (120) | V100    | 0.94 min      | 99.62%       | 96.64%         | 96.33%   |
-| 5 | gpt2-medium (355M) | pretrained | last            | last_block       | longest train ex. (120) | V100    | 0.91 min      | 87.50%       | 51.01%         | 56.67%   |
-| 6 | gpt2-large (774M)  | pretrained | last            | last_block       | longest train ex. (120) | V100    | 1.91 min      | 99.52%       | 98.66%         | 96.67%   |
-| 7 | gpt2-small (124M)  | random     | last            | all              | longest train ex. (120) | V100    | 0.93 min      | 100%         | 97.32%         | 93.00%   |
-| 8 | gpt2-small (124M)  | pretrained | last            | last_block       | context length (1024)   | V100    | 3.24 min      | 83.08%       | 87.92%         | 78.33%   |
+|      | Model              | Weights    | Trainable token | Trainable layers | Context length          | CPU/GPU | Training time | Training acc | Validation acc | Test acc |
+| ---- | ------------------ | ---------- | --------------- | ---------------- | ----------------------- | ------- | ------------- | ------------ | -------------- | -------- |
+| 1    | gpt2-small (124M)  | pretrained | last            | last_block       | longest train ex. (120) | V100    | 0.39 min      | 96.63%       | 99.33%         | 95.00%   |
+| 2    | gpt2-small (124M)  | pretrained | first           | last_block       | longest train ex. (120) | V100    | 0.37 min      | 78.46%       | 80.54%         | 75.00%   |
+| 3    | gpt2-small (124M)  | pretrained | last            | last_layer       | longest train ex. (120) | V100    | 0.33 min      | 78.65%       | 79.87%         | 72.00%   |
+| 4    | gpt2-small (124M)  | pretrained | last            | all              | longest train ex. (120) | V100    | 0.94 min      | 99.62%       | 96.64%         | 96.67%   |
+| 5    | gpt2-medium (355M) | pretrained | last            | last_block       | longest train ex. (120) | V100    | 0.91 min      | 87.50%       | 91.28%         | 84.67%   |
+| 6    | gpt2-large (774M)  | pretrained | last            | last_block       | longest train ex. (120) | V100    | 1.91 min      | 99.52%       | 98.66%         | 96.67%   |
+| 7    | gpt2-small (124M)  | random     | last            | all              | longest train ex. (120) | V100    | 0.93 min      | 100%         | 96.64%         | 93.67%   |
+| 8    | gpt2-small (124M)  | pretrained | last            | last_block       | context length (1024)   | V100    | 3.24 min      | 83.08%       | 87.92%         | 78.33%   |
 
 &nbsp;
 
@@ -30,24 +30,24 @@ You can use the following code to reproduce the experiments:
 - Row 2: `python additional-experiments.py --trainable_token first` 
 - Row 3: `python additional-experiments.py --trainable_layers last_layer`
 - Row 4: `python additional-experiments.py --trainable_layers all`
-- Row 5: `python additional-experiments.py --model_size gpt2-medium (355M)`
-- Row 6: `python additional-experiments.py --model_size gpt2-large (774M)`
+- Row 5: `python additional-experiments.py --model_size "gpt2-medium (355M)"`
+- Row 6: `python additional-experiments.py --model_size "gpt2-large (774M)"`
 - Row 7: `python additional-experiments.py --weights random --trainable_layers all`
 - Row 8: `python additional-experiments.py --context_length "model_context_length"`
 
 I've kept the LLM and dataset small on purpose, so you can run the training on a regular laptop like a MacBook Air M3 in about 15 minutes in case you don't have access to a GPU.
-  
+
 &nbsp;
 
 ### Interpretation
 
-1. **Training the Last vs. First Output Token (Row 1 vs. 2)**: Training the last output token results in significantly better performance compared to the first. This improvement is expected due to the causal self-attention mask.
+1. **Training the Last vs. First Output Token (Row 1 vs. 2)**: Training the last output token results in substantially better performance compared to the first. This improvement is expected due to the causal self-attention mask.
 
-2. **Training the Last Transformer Block vs. Last Layer (Row 1 vs. 3)**: Training the entire last transformer block is much more effective than training only the last layer.
+2. **Training the Last Transformer Block vs. Last Layer (Row 1 vs. 3)**: Training the entire last transformer block also yields substantially better results than training only the last layer.
 
-3. **Training All Layers vs. Last Transformer Block (Row 1 vs. 4)**: Training all layers shows a modest improvement of 2% over just training the last transformer block, but it requires almost three times longer in terms of training duration.
+3. **Training All Layers vs. Last Transformer Block (Row 1 vs. 4)**: Training all layers shows a modest improvement of ~2% over just training the last transformer block, but it takes almost three times as long to train.
 
-4. **Using Larger Pretrained Models (Row 1 vs 5, and Row 1 vs. 6)**: Employing a 3x larger pretrained model leads to worse results. However, using a 5x larger model improves performance compared to the initial model, as was anticipated.
+4. **Using Larger Pretrained Models (Row 1 vs. 5, and Row 1 vs. 6)**: Employing a 3x larger pretrained model leads to worse results. However, using a 5x larger model improves performance compared to the initial model, as was anticipated. (The medium model was perhaps not pretrained as well, or this particular finetuning configuration does not work as well for this model.)
 
 5. **Using a Model with Random Weights vs. Pretrained Weights (Row 1 vs. 7)**: Utilizing a model with random weights yields results that are only slightly worse by 1.3% compared to using pretrained weights.
 
diff --git a/ch06/02_bonus_additional-experiments/additional-experiments.py b/ch06/02_bonus_additional-experiments/additional-experiments.py
index b9e824d..81809a2 100644
--- a/ch06/02_bonus_additional-experiments/additional-experiments.py
+++ b/ch06/02_bonus_additional-experiments/additional-experiments.py
@@ -123,6 +123,9 @@ def instantiate_model(choose_model, load_weights):
     }
 
     BASE_CONFIG.update(model_configs[choose_model])
+
+    if not load_weights:
+        torch.manual_seed(123)
     model = GPTModel(BASE_CONFIG)
 
     if load_weights:
@@ -354,17 +357,20 @@ if __name__ == "__main__":
 
     tokenizer = tiktoken.get_encoding("gpt2")
 
+    train_dataset = None
     if args.context_length == "model_context_length":
         max_length = model.pos_emb.weight.shape[0]
     elif args.context_length == "longest_training_example":
-        max_length = None
+        train_dataset = SpamDataset(base_path / "train.csv", max_length=None, tokenizer=tokenizer)
+        max_length = train_dataset.max_length
     else:
         try:
             max_length = int(args.context_length)
         except ValueError:
             raise ValueError("Invalid --context_length argument")
 
-    train_dataset = SpamDataset(base_path / "train.csv", max_length=max_length, tokenizer=tokenizer)
+    if train_dataset is None:
+        train_dataset = SpamDataset(base_path / "train.csv", max_length=max_length, tokenizer=tokenizer)
     val_dataset = SpamDataset(base_path / "validation.csv", max_length=max_length, tokenizer=tokenizer)
     test_dataset = SpamDataset(base_path / "test.csv", max_length=max_length, tokenizer=tokenizer)
 
diff --git a/ch06/03_bonus_imdb-classification/download-prepare-dataset.py b/ch06/03_bonus_imdb-classification/download-prepare-dataset.py
index 28197e6..e3e60b4 100644
--- a/ch06/03_bonus_imdb-classification/download-prepare-dataset.py
+++ b/ch06/03_bonus_imdb-classification/download-prepare-dataset.py
@@ -68,7 +68,7 @@ def partition_and_save(df, sizes=(35000, 5000, 10000)):
 
     # Save to CSV files
     train.to_csv("train.csv", index=False)
-    val.to_csv("val.csv", index=False)
+    val.to_csv("validation.csv", index=False)
     test.to_csv("test.csv", index=False)
 
 
diff --git a/ch06/03_bonus_imdb-classification/train-gpt.py b/ch06/03_bonus_imdb-classification/train-gpt.py
index dda708b..65da198 100644
--- a/ch06/03_bonus_imdb-classification/train-gpt.py
+++ b/ch06/03_bonus_imdb-classification/train-gpt.py
@@ -67,6 +67,9 @@ def instantiate_model(choose_model, load_weights):
     }
 
     BASE_CONFIG.update(model_configs[choose_model])
+
+    if not load_weights:
+        torch.manual_seed(123)
     model = GPTModel(BASE_CONFIG)
 
     if load_weights:
@@ -294,18 +297,21 @@ if __name__ == "__main__":
 
     tokenizer = tiktoken.get_encoding("gpt2")
 
+    train_dataset = None
     if args.context_length == "model_context_length":
         max_length = model.pos_emb.weight.shape[0]
     elif args.context_length == "longest_training_example":
-        max_length = None
+        train_dataset = IMDBDataset(base_path / "train.csv", max_length=None, tokenizer=tokenizer)
+        max_length = train_dataset.max_length
     else:
         try:
             max_length = int(args.context_length)
         except ValueError:
             raise ValueError("Invalid --context_length argument")
 
-    train_dataset = IMDBDataset(base_path / "train.csv", max_length=max_length, tokenizer=tokenizer)
-    val_dataset = IMDBDataset(base_path / "val.csv", max_length=max_length, tokenizer=tokenizer)
+    if train_dataset is None:
+        train_dataset = IMDBDataset(base_path / "train.csv", max_length=max_length, tokenizer=tokenizer)
+    val_dataset = IMDBDataset(base_path / "validation.csv", max_length=max_length, tokenizer=tokenizer)
     test_dataset = IMDBDataset(base_path / "test.csv", max_length=max_length, tokenizer=tokenizer)
 
     num_workers = 0