diff --git a/appendix-D/01_main-chapter-code/previous_chapters.py b/appendix-D/01_main-chapter-code/previous_chapters.py
index c7966ce..9c2fdfa 100644
--- a/appendix-D/01_main-chapter-code/previous_chapters.py
+++ b/appendix-D/01_main-chapter-code/previous_chapters.py
@@ -25,7 +25,7 @@ class GPTDatasetV1(Dataset):
         self.target_ids = []
 
         # Tokenize the entire text
-        token_ids = tokenizer.encode(txt)
+        token_ids = self.tokenizer.encode(txt)
 
         # Use a sliding window to chunk the book into overlapping sequences of max_length
         for i in range(0, len(token_ids) - max_length, stride):
diff --git a/ch02/01_main-chapter-code/ch02.ipynb b/ch02/01_main-chapter-code/ch02.ipynb
index 0962797..a57af93 100644
--- a/ch02/01_main-chapter-code/ch02.ipynb
+++ b/ch02/01_main-chapter-code/ch02.ipynb
@@ -1273,7 +1273,7 @@
     "        self.target_ids = []\n",
     "\n",
     "        # Tokenize the entire text\n",
-    "        token_ids = tokenizer.encode(txt, allowed_special={'<|endoftext|>'})\n",
+    "        token_ids = self.tokenizer.encode(txt, allowed_special={'<|endoftext|>'})\n",
     "\n",
     "        # Use a sliding window to chunk the book into overlapping sequences of max_length\n",
     "        for i in range(0, len(token_ids) - max_length, stride):\n",
diff --git a/ch02/01_main-chapter-code/dataloader.ipynb b/ch02/01_main-chapter-code/dataloader.ipynb
index 5a916a9..10978ef 100644
--- a/ch02/01_main-chapter-code/dataloader.ipynb
+++ b/ch02/01_main-chapter-code/dataloader.ipynb
@@ -48,7 +48,7 @@
     "        self.target_ids = []\n",
     "\n",
     "        # Tokenize the entire text\n",
-    "        token_ids = tokenizer.encode(txt, allowed_special={'<|endoftext|>'})\n",
+    "        token_ids = self.tokenizer.encode(txt, allowed_special={'<|endoftext|>'})\n",
     "\n",
     "        # Use a sliding window to chunk the book into overlapping sequences of max_length\n",
     "        for i in range(0, len(token_ids) - max_length, stride):\n",
@@ -150,7 +150,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.6"
+   "version": "3.10.10"
   }
  },
  "nbformat": 4,
diff --git a/ch02/01_main-chapter-code/exercise-solutions.ipynb b/ch02/01_main-chapter-code/exercise-solutions.ipynb
index d833ac4..5efd45e 100644
--- a/ch02/01_main-chapter-code/exercise-solutions.ipynb
+++ b/ch02/01_main-chapter-code/exercise-solutions.ipynb
@@ -256,7 +256,7 @@
     "        self.target_ids = []\n",
     "\n",
     "        # Tokenize the entire text\n",
-    "        token_ids = tokenizer.encode(txt)\n",
+    "        token_ids = self.tokenizer.encode(txt)\n",
     "\n",
     "        # Use a sliding window to chunk the book into overlapping sequences of max_length\n",
     "        for i in range(0, len(token_ids) - max_length, stride):\n",
@@ -377,7 +377,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.6"
+   "version": "3.10.10"
   }
  },
  "nbformat": 4,
diff --git a/ch03/01_main-chapter-code/multihead-attention.ipynb b/ch03/01_main-chapter-code/multihead-attention.ipynb
index ac546b9..51579b8 100644
--- a/ch03/01_main-chapter-code/multihead-attention.ipynb
+++ b/ch03/01_main-chapter-code/multihead-attention.ipynb
@@ -78,7 +78,7 @@
     "        self.target_ids = []\n",
     "\n",
     "        # Tokenize the entire text\n",
-    "        token_ids = tokenizer.encode(txt, allowed_special={'<|endoftext|>'})\n",
+    "        token_ids = self.tokenizer.encode(txt, allowed_special={'<|endoftext|>'})\n",
     "\n",
     "        # Use a sliding window to chunk the book into overlapping sequences of max_length\n",
     "        for i in range(0, len(token_ids) - max_length, stride):\n",
@@ -374,7 +374,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.6"
+   "version": "3.10.10"
   }
  },
  "nbformat": 4,
diff --git a/ch04/01_main-chapter-code/gpt.py b/ch04/01_main-chapter-code/gpt.py
index 45e8c5b..d7e9e8a 100644
--- a/ch04/01_main-chapter-code/gpt.py
+++ b/ch04/01_main-chapter-code/gpt.py
@@ -19,7 +19,7 @@ class GPTDatasetV1(Dataset):
         self.target_ids = []
 
         # Tokenize the entire text
-        token_ids = tokenizer.encode(txt)
+        token_ids = self.tokenizer.encode(txt)
 
         # Use a sliding window to chunk the book into overlapping sequences of max_length
         for i in range(0, len(token_ids) - max_length, stride):
diff --git a/ch04/01_main-chapter-code/previous_chapters.py b/ch04/01_main-chapter-code/previous_chapters.py
index f4f18dc..197cdb7 100644
--- a/ch04/01_main-chapter-code/previous_chapters.py
+++ b/ch04/01_main-chapter-code/previous_chapters.py
@@ -16,7 +16,7 @@ class GPTDatasetV1(Dataset):
         self.target_ids = []
 
         # Tokenize the entire text
-        token_ids = tokenizer.encode(txt)
+        token_ids = self.tokenizer.encode(txt)
 
         # Use a sliding window to chunk the book into overlapping sequences of max_length
         for i in range(0, len(token_ids) - max_length, stride):
diff --git a/ch05/01_main-chapter-code/previous_chapters.py b/ch05/01_main-chapter-code/previous_chapters.py
index 35fbf6d..3da8ba1 100644
--- a/ch05/01_main-chapter-code/previous_chapters.py
+++ b/ch05/01_main-chapter-code/previous_chapters.py
@@ -19,7 +19,7 @@ class GPTDatasetV1(Dataset):
         self.target_ids = []
 
         # Tokenize the entire text
-        token_ids = tokenizer.encode(txt)
+        token_ids = self.tokenizer.encode(txt)
 
         # Use a sliding window to chunk the book into overlapping sequences of max_length
         for i in range(0, len(token_ids) - max_length, stride):
diff --git a/ch05/02_alternative_weight_loading/previous_chapters.py b/ch05/02_alternative_weight_loading/previous_chapters.py
index 4188f5b..75a3af0 100644
--- a/ch05/02_alternative_weight_loading/previous_chapters.py
+++ b/ch05/02_alternative_weight_loading/previous_chapters.py
@@ -19,7 +19,7 @@ class GPTDatasetV1(Dataset):
         self.target_ids = []
 
         # Tokenize the entire text
-        token_ids = tokenizer.encode(txt)
+        token_ids = self.tokenizer.encode(txt)
 
         # Use a sliding window to chunk the book into overlapping sequences of max_length
         for i in range(0, len(token_ids) - max_length, stride):
diff --git a/ch05/03_bonus_pretraining_on_gutenberg/previous_chapters.py b/ch05/03_bonus_pretraining_on_gutenberg/previous_chapters.py
index 38edd4a..5e22a26 100644
--- a/ch05/03_bonus_pretraining_on_gutenberg/previous_chapters.py
+++ b/ch05/03_bonus_pretraining_on_gutenberg/previous_chapters.py
@@ -25,7 +25,7 @@ class GPTDatasetV1(Dataset):
         self.input_ids = []
         self.target_ids = []
 
-        token_ids = tokenizer.encode(txt, allowed_special={'<|endoftext|>'})
+        token_ids = self.tokenizer.encode(txt, allowed_special={'<|endoftext|>'})
 
         for i in range(0, len(token_ids) - max_length, stride):
             input_chunk = token_ids[i:i + max_length]
diff --git a/ch05/05_bonus_hparam_tuning/previous_chapters.py b/ch05/05_bonus_hparam_tuning/previous_chapters.py
index 169630c..c5c6c1c 100644
--- a/ch05/05_bonus_hparam_tuning/previous_chapters.py
+++ b/ch05/05_bonus_hparam_tuning/previous_chapters.py
@@ -24,7 +24,7 @@ class GPTDatasetV1(Dataset):
         self.target_ids = []
 
         # Tokenize the entire text
-        token_ids = tokenizer.encode(txt)
+        token_ids = self.tokenizer.encode(txt)
 
         # Use a sliding window to chunk the book into overlapping sequences of max_length
         for i in range(0, len(token_ids) - max_length, stride):