Mirror of https://github.com/rasbt/LLMs-from-scratch.git, synced 2025-08-15 04:01:44 +00:00
Fix Loss in Gutenberg bonus section (#109)
parent 6de0417321
commit adc2964fc5
@@ -1081,10 +1081,8 @@
 "source": [
 "def calc_loss_batch(input_batch, target_batch, model, device):\n",
 "    input_batch, target_batch = input_batch.to(device), target_batch.to(device)\n",
-"\n",
 "    logits = model(input_batch)\n",
-"    logits = logits.flatten(0, 1)\n",
-"    loss = torch.nn.functional.cross_entropy(logits, target_batch.flatten())\n",
+"    loss = torch.nn.functional.cross_entropy(logits.flatten(0, 1), target_batch.flatten())\n",
 "    return loss\n",
 "\n",
 "\n",
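Note on the cell change above: torch.nn.functional.cross_entropy expects class logits of shape (N, num_classes) and integer targets of shape (N,), so the batch and sequence dimensions are merged into one before the call. A minimal shape check with made-up sizes (illustrative only, not part of the commit):

import torch

batch_size, seq_len, vocab_size = 2, 4, 50257                  # illustrative sizes only
logits = torch.randn(batch_size, seq_len, vocab_size)          # model output: (B, T, V)
targets = torch.randint(0, vocab_size, (batch_size, seq_len))  # token IDs: (B, T)

loss = torch.nn.functional.cross_entropy(
    logits.flatten(0, 1),   # (2, 4, 50257) -> (8, 50257)
    targets.flatten()       # (2, 4)        -> (8,)
)
print(loss.shape)           # torch.Size([]) -- a single scalar loss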
@@ -2403,7 +2401,7 @@
 "name": "python",
 "nbconvert_exporter": "python",
 "pygments_lexer": "ipython3",
-"version": "3.10.6"
+"version": "3.11.4"
 }
 },
 "nbformat": 4,
@ -164,18 +164,32 @@ if __name__ == "__main__":
|
|||||||
help='Learning rate for the optimizer')
|
help='Learning rate for the optimizer')
|
||||||
parser.add_argument('--batch_size', type=int, default=4,
|
parser.add_argument('--batch_size', type=int, default=4,
|
||||||
help='Batch size for training')
|
help='Batch size for training')
|
||||||
|
parser.add_argument('--debug', type=bool, default=False,
|
||||||
|
help='Uses a very small model for debugging purposes')
|
||||||
|
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
GPT_CONFIG_124M = {
|
if args.debug:
|
||||||
"vocab_size": 50257, # Vocabulary size
|
GPT_CONFIG_124M = {
|
||||||
"context_length": 1024, # Context length
|
"vocab_size": 50257, # Vocabulary size
|
||||||
"emb_dim": 768, # Embedding dimension
|
"context_length": 10, # Context length
|
||||||
"n_heads": 12, # Number of attention heads
|
"emb_dim": 12, # Embedding dimension
|
||||||
"n_layers": 12, # Number of layers
|
"n_heads": 2, # Number of attention heads
|
||||||
"drop_rate": 0.1, # Dropout rate
|
"n_layers": 2, # Number of layers
|
||||||
"qkv_bias": False # Query-key-value bias
|
"drop_rate": 0.0, # Dropout rate
|
||||||
}
|
"qkv_bias": False # Query-key-value bias
|
||||||
|
}
|
||||||
|
|
||||||
|
else:
|
||||||
|
GPT_CONFIG_124M = {
|
||||||
|
"vocab_size": 50257, # Vocabulary size
|
||||||
|
"context_length": 1024, # Context length
|
||||||
|
"emb_dim": 768, # Embedding dimension
|
||||||
|
"n_heads": 12, # Number of attention heads
|
||||||
|
"n_layers": 12, # Number of layers
|
||||||
|
"drop_rate": 0.1, # Dropout rate
|
||||||
|
"qkv_bias": False # Query-key-value bias
|
||||||
|
}
|
||||||
|
|
||||||
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
||||||
torch.manual_seed(123)
|
torch.manual_seed(123)
|
||||||
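Note on the --debug configuration above: the tiny settings exist so the model can be built and trained for a few steps in seconds during testing. A rough back-of-the-envelope weight count, assuming a standard GPT-style block with a 4x feed-forward expansion and ignoring layer norms, biases, and the output head (order-of-magnitude only, not the book's exact parameter counts):

def approx_weights(cfg):
    # token + positional embedding tables
    emb = cfg["vocab_size"] * cfg["emb_dim"] + cfg["context_length"] * cfg["emb_dim"]
    # per block: Q/K/V/output projections plus a 4x-expansion feed-forward network
    attn = 4 * cfg["emb_dim"] ** 2
    ffn = 2 * cfg["emb_dim"] * (4 * cfg["emb_dim"])
    return emb + cfg["n_layers"] * (attn + ffn)

full_cfg = {"vocab_size": 50257, "context_length": 1024, "emb_dim": 768, "n_layers": 12}
debug_cfg = {"vocab_size": 50257, "context_length": 10, "emb_dim": 12, "n_layers": 2}

print(f"full:  ~{approx_weights(full_cfg):,}")   # ~124,318,464 -> on the order of 1e8 weights
print(f"debug: ~{approx_weights(debug_cfg):,}")  # ~606,660     -> dominated by the embedding table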
@ -210,8 +224,6 @@ if __name__ == "__main__":
|
|||||||
)
|
)
|
||||||
|
|
||||||
epochs_tensor = torch.linspace(0, args.n_epochs, len(train_losses))
|
epochs_tensor = torch.linspace(0, args.n_epochs, len(train_losses))
|
||||||
|
|
||||||
print("debug", epochs_tensor, tokens_seen, train_losses, val_losses, output_dir)
|
|
||||||
plot_losses(epochs_tensor, tokens_seen, train_losses, val_losses, output_dir)
|
plot_losses(epochs_tensor, tokens_seen, train_losses, val_losses, output_dir)
|
||||||
|
|
||||||
torch.save(model.state_dict(), output_dir / "model_pg_final.pth")
|
torch.save(model.state_dict(), output_dir / "model_pg_final.pth")
|
||||||
|
@@ -244,7 +244,7 @@ def generate_text_simple(model, idx, max_new_tokens, context_size):
 def calc_loss_batch(input_batch, target_batch, model, device):
     input_batch, target_batch = input_batch.to(device), target_batch.to(device)
     logits = model(input_batch)
-    loss = torch.nn.functional.cross_entropy(logits.flatten(0, -1), target_batch.flatten())
+    loss = torch.nn.functional.cross_entropy(logits.flatten(0, 1), target_batch.flatten())
    return loss


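Note on the one-line fix above (illustrative, not from the commit): flatten(0, 1) merges only the batch and sequence dimensions and keeps the vocabulary dimension as the class axis, whereas flatten(0, -1) collapses the logits into a single 1-D vector with no class dimension left, which no longer matches the (N, num_classes) / (N,) layout that cross_entropy expects:

import torch

logits = torch.randn(2, 4, 50257)       # (batch, seq_len, vocab_size), illustrative sizes
print(logits.flatten(0, 1).shape)       # torch.Size([8, 50257]) -- valid (N, num_classes) input
print(logits.flatten(0, -1).shape)      # torch.Size([402056])   -- everything collapsed to 1-D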
ch05/03_bonus_pretraining_on_gutenberg/tests.py (new file, 32 lines)
@@ -0,0 +1,32 @@
+# Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt).
+# Source for "Build a Large Language Model From Scratch"
+#   - https://www.manning.com/books/build-a-large-language-model-from-scratch
+# Code: https://github.com/rasbt/LLMs-from-scratch
+
+# File for internal use (unit tests)
+
+from pathlib import Path
+import os
+import subprocess
+
+
+def test_pretraining():
+
+    sequence = "a b c d"
+    repetitions = 1000
+    content = sequence * repetitions
+
+    folder_path = Path("gutenberg") / "data"
+    file_name = "repeated_sequence.txt"
+
+    os.makedirs(folder_path, exist_ok=True)
+
+    with open(folder_path/file_name, "w") as file:
+        file.write(content)
+
+    result = subprocess.run(
+        ["python", "pretraining_simple.py", "--debug", "true"],
+        capture_output=True, text=True
+    )
+    print(result.stdout)
+    assert "Maximum GPU memory allocated" in result.stdout
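The new test writes a small synthetic corpus into gutenberg/data/ and then runs pretraining_simple.py as a subprocess with --debug true; presumably it is meant to be executed with pytest from inside ch05/03_bonus_pretraining_on_gutenberg/. One detail about the "--debug true" invocation: because the argument is declared with type=bool, argparse simply calls bool() on the raw string, so any non-empty value enables debug mode. A small sketch (not part of the commit):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--debug', type=bool, default=False)

print(parser.parse_args(['--debug', 'true']).debug)   # True
print(parser.parse_args(['--debug', 'false']).debug)  # True -- bool('false') is still True
print(parser.parse_args([]).debug)                    # False (default)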