Fix Loss in Gutenberg bonus section (#109)

Sebastian Raschka 2024-04-04 20:54:09 -05:00 committed by GitHub
parent 6de0417321
commit adc2964fc5
GPG Key ID: B5690EEEBB952194
4 changed files with 58 additions and 16 deletions

View File

@@ -1081,10 +1081,8 @@
    "source": [
     "def calc_loss_batch(input_batch, target_batch, model, device):\n",
     "    input_batch, target_batch = input_batch.to(device), target_batch.to(device)\n",
-    "\n",
     "    logits = model(input_batch)\n",
-    "    logits = logits.flatten(0, 1)\n",
-    "    loss = torch.nn.functional.cross_entropy(logits, target_batch.flatten())\n",
+    "    loss = torch.nn.functional.cross_entropy(logits.flatten(0, 1), target_batch.flatten())\n",
     "    return loss\n",
     "\n",
     "\n",
@@ -2403,7 +2401,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.6"
+   "version": "3.11.4"
   }
  },
  "nbformat": 4,

View File

@@ -164,18 +164,32 @@ if __name__ == "__main__":
                         help='Learning rate for the optimizer')
     parser.add_argument('--batch_size', type=int, default=4,
                         help='Batch size for training')
+    parser.add_argument('--debug', type=bool, default=False,
+                        help='Uses a very small model for debugging purposes')
 
     args = parser.parse_args()
 
-    GPT_CONFIG_124M = {
-        "vocab_size": 50257,     # Vocabulary size
-        "context_length": 1024,  # Context length
-        "emb_dim": 768,          # Embedding dimension
-        "n_heads": 12,           # Number of attention heads
-        "n_layers": 12,          # Number of layers
-        "drop_rate": 0.1,        # Dropout rate
-        "qkv_bias": False        # Query-key-value bias
-    }
+    if args.debug:
+        GPT_CONFIG_124M = {
+            "vocab_size": 50257,     # Vocabulary size
+            "context_length": 10,    # Context length
+            "emb_dim": 12,           # Embedding dimension
+            "n_heads": 2,            # Number of attention heads
+            "n_layers": 2,           # Number of layers
+            "drop_rate": 0.0,        # Dropout rate
+            "qkv_bias": False        # Query-key-value bias
+        }
+
+    else:
+        GPT_CONFIG_124M = {
+            "vocab_size": 50257,     # Vocabulary size
+            "context_length": 1024,  # Context length
+            "emb_dim": 768,          # Embedding dimension
+            "n_heads": 12,           # Number of attention heads
+            "n_layers": 12,          # Number of layers
+            "drop_rate": 0.1,        # Dropout rate
+            "qkv_bias": False        # Query-key-value bias
+        }
 
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
     torch.manual_seed(123)
@@ -210,8 +224,6 @@ if __name__ == "__main__":
     )
 
     epochs_tensor = torch.linspace(0, args.n_epochs, len(train_losses))
-    print("debug", epochs_tensor, tokens_seen, train_losses, val_losses, output_dir)
-
     plot_losses(epochs_tensor, tokens_seen, train_losses, val_losses, output_dir)
 
     torch.save(model.state_dict(), output_dir / "model_pg_final.pth")
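
As a side note on the new flag: the standalone sketch below is not part of the commit; it only illustrates how the --debug toggle selects between the two configurations. Because argparse's type=bool converts any non-empty string to True, passing any value (for example --debug true, as the new unit test does) enables the small debug model.

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--debug', type=bool, default=False,
                    help='Uses a very small model for debugging purposes')

# Simulate the command-line call used by the new unit test: --debug true
args = parser.parse_args(["--debug", "true"])

# Any non-empty string passed to type=bool evaluates to True,
# so args.debug is True here and the tiny configuration is picked.
if args.debug:
    config = {"context_length": 10, "emb_dim": 12, "n_heads": 2, "n_layers": 2}
else:
    config = {"context_length": 1024, "emb_dim": 768, "n_heads": 12, "n_layers": 12}

print(config)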

View File

@@ -244,7 +244,7 @@ def generate_text_simple(model, idx, max_new_tokens, context_size):
 
 
 def calc_loss_batch(input_batch, target_batch, model, device):
     input_batch, target_batch = input_batch.to(device), target_batch.to(device)
     logits = model(input_batch)
-    loss = torch.nn.functional.cross_entropy(logits.flatten(0, -1), target_batch.flatten())
+    loss = torch.nn.functional.cross_entropy(logits.flatten(0, 1), target_batch.flatten())
     return loss
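
For context on why this one-character change matters: torch.nn.functional.cross_entropy expects logits of shape (N, num_classes) and targets of shape (N,). With logits of shape (batch_size, num_tokens, vocab_size), flatten(0, 1) merges only the batch and token dimensions and keeps vocab_size as the class dimension, whereas flatten(0, -1) collapses everything into a single 1-D tensor that no longer matches the flattened targets. A minimal, self-contained sketch (illustrative only, not taken from the repository):

import torch

batch_size, num_tokens, vocab_size = 2, 3, 5
logits = torch.randn(batch_size, num_tokens, vocab_size)          # model output
targets = torch.randint(0, vocab_size, (batch_size, num_tokens))  # token IDs

loss = torch.nn.functional.cross_entropy(
    logits.flatten(0, 1),   # shape: (batch_size*num_tokens, vocab_size)
    targets.flatten()       # shape: (batch_size*num_tokens,)
)
print(loss)

# logits.flatten(0, -1) would instead yield a 1-D tensor with
# batch_size*num_tokens*vocab_size elements, which cross_entropy
# cannot pair with the (batch_size*num_tokens,) targets.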

View File

@@ -0,0 +1,32 @@
+# Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt).
+# Source for "Build a Large Language Model From Scratch"
+# - https://www.manning.com/books/build-a-large-language-model-from-scratch
+# Code: https://github.com/rasbt/LLMs-from-scratch
+
+# File for internal use (unit tests)
+
+from pathlib import Path
+import os
+import subprocess
+
+
+def test_pretraining():
+
+    sequence = "a b c d"
+    repetitions = 1000
+    content = sequence * repetitions
+
+    folder_path = Path("gutenberg") / "data"
+    file_name = "repeated_sequence.txt"
+
+    os.makedirs(folder_path, exist_ok=True)
+
+    with open(folder_path/file_name, "w") as file:
+        file.write(content)
+
+    result = subprocess.run(
+        ["python", "pretraining_simple.py", "--debug", "true"],
+        capture_output=True, text=True
+    )
+    print(result.stdout)
+    assert "Maximum GPU memory allocated" in result.stdout