diff --git a/ch02/05_bpe-from-scratch/tests.py b/ch02/05_bpe-from-scratch/tests.py
index e82e8cd..254c614 100644
--- a/ch02/05_bpe-from-scratch/tests.py
+++ b/ch02/05_bpe-from-scratch/tests.py
@@ -5,8 +5,6 @@
 import nbformat
 import types
 import pytest
-import tiktoken
-
 
 def import_definitions_from_notebook(fullname, names):
     """Loads function definitions from a Jupyter notebook file into a module."""
@@ -47,17 +45,15 @@
 def verdict_file(imported_module):
     """Fixture to handle downloading The Verdict file."""
     download_file_if_absent = getattr(imported_module, "download_file_if_absent", None)
 
-    verdict_path = download_file_if_absent(
-        url=(
-            "https://raw.githubusercontent.com/rasbt/"
-            "LLMs-from-scratch/main/ch02/01_main-chapter-code/"
-            "the-verdict.txt"
-        ),
-        filename="the-verdict.txt",
-        search_dirs=["ch02/01_main-chapter-code/", "../01_main-chapter-code/", "."]
-    )
+    search_directories = [".", "../02_bonus_bytepair-encoder/gpt2_model/"]
+    files_to_download = {
+        "https://openaipublic.blob.core.windows.net/gpt-2/models/124M/vocab.bpe": "vocab.bpe",
+        "https://openaipublic.blob.core.windows.net/gpt-2/models/124M/encoder.json": "encoder.json"
+    }
+    paths = {filename: download_file_if_absent(url, filename, search_directories)
+             for url, filename in files_to_download.items()}
 
-    return verdict_path
+    return paths
 
 @pytest.fixture(scope="module")
@@ -98,6 +94,7 @@ def test_tokenizer_training(imported_module, verdict_file):
     tokenizer2.load_vocab_and_merges(vocab_path="vocab.json", bpe_merges_path="bpe_merges.txt")
     assert tokenizer2.decode(token_ids) == input_text, "Decoded text mismatch after reloading tokenizer."
 
+
 def test_gpt2_tokenizer_openai_simple(imported_module, gpt2_files):
     BPETokenizerSimple = getattr(imported_module, "BPETokenizerSimple", None)
 
diff --git a/ch05/10_llm-training-speed/02_opt_multi_gpu_ddp.py b/ch05/10_llm-training-speed/02_opt_multi_gpu_ddp.py
index 29db397..746cc7f 100644
--- a/ch05/10_llm-training-speed/02_opt_multi_gpu_ddp.py
+++ b/ch05/10_llm-training-speed/02_opt_multi_gpu_ddp.py
@@ -312,7 +312,7 @@ def generate_and_print_sample(model, device, start_context):
 
 
 def train_model_simple_with_timing(model, train_loader, val_loader, optimizer, device,
-                                   num_epochs, eval_freq, eval_iter, start_context, tokenizer):
+                                   num_epochs, eval_freq, eval_iter, start_context):
     train_losses, val_losses, track_tokens = [], [], []
     total_tokens, global_step, last_tokens = 0, -1, 0
 
@@ -524,8 +524,6 @@ def main(gpt_config, settings, rank, world_size):
     # Train model
     ##############################
 
-    tokenizer = tiktoken.get_encoding("gpt2")
-
     train_losses, val_losses, tokens_seen = train_model_simple_with_timing(
         model=model,
         train_loader=train_loader,
@@ -536,7 +534,6 @@ def main(gpt_config, settings, rank, world_size):
         eval_freq=5,
         eval_iter=1,
         start_context="Every effort moves you",
-        tokenizer=tokenizer
     )
 
     # NEW: Clean up distributed processes
diff --git a/ch06/01_main-chapter-code/gpt_class_finetune.py b/ch06/01_main-chapter-code/gpt_class_finetune.py
index 8308304..239f374 100644
--- a/ch06/01_main-chapter-code/gpt_class_finetune.py
+++ b/ch06/01_main-chapter-code/gpt_class_finetune.py
@@ -175,7 +175,7 @@ def evaluate_model(model, train_loader, val_loader, device, eval_iter):
 
 
 def train_classifier_simple(model, train_loader, val_loader, optimizer, device, num_epochs,
-                            eval_freq, eval_iter, tokenizer):
+                            eval_freq, eval_iter):
     # Initialize lists to track losses and tokens seen
     train_losses, val_losses, train_accs, val_accs = [], [], [], []
     examples_seen, global_step = 0, -1
@@ -408,7 +408,6 @@ if __name__ == "__main__":
 
     train_losses, val_losses, train_accs, val_accs, examples_seen = train_classifier_simple(
         model, train_loader, val_loader, optimizer, device,
        num_epochs=num_epochs, eval_freq=50, eval_iter=5,
-        tokenizer=tokenizer
     )
     end_time = time.time()