diff --git a/appendix-D/01_main-chapter-code/previous_chapters.py b/appendix-D/01_main-chapter-code/previous_chapters.py
index 46030ef..47170a5 100644
--- a/appendix-D/01_main-chapter-code/previous_chapters.py
+++ b/appendix-D/01_main-chapter-code/previous_chapters.py
@@ -24,7 +24,7 @@ class GPTDatasetV1(Dataset):
         self.target_ids = []
 
         # Tokenize the entire text
-        token_ids = tokenizer.encode(txt)
+        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})
 
         # Use a sliding window to chunk the book into overlapping sequences of max_length
         for i in range(0, len(token_ids) - max_length, stride):
diff --git a/appendix-E/01_main-chapter-code/previous_chapters.py b/appendix-E/01_main-chapter-code/previous_chapters.py
index f62ecbd..6270b33 100644
--- a/appendix-E/01_main-chapter-code/previous_chapters.py
+++ b/appendix-E/01_main-chapter-code/previous_chapters.py
@@ -28,12 +28,11 @@ from torch.utils.data import Dataset, DataLoader
 
 class GPTDatasetV1(Dataset):
     def __init__(self, txt, tokenizer, max_length, stride):
-        self.tokenizer = tokenizer
         self.input_ids = []
         self.target_ids = []
 
         # Tokenize the entire text
-        token_ids = tokenizer.encode(txt)
+        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})
 
         # Use a sliding window to chunk the book into overlapping sequences of max_length
         for i in range(0, len(token_ids) - max_length, stride):
diff --git a/ch02/01_main-chapter-code/ch02.ipynb b/ch02/01_main-chapter-code/ch02.ipynb
index bf1ca20..6349f86 100644
--- a/ch02/01_main-chapter-code/ch02.ipynb
+++ b/ch02/01_main-chapter-code/ch02.ipynb
@@ -1920,7 +1920,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.6"
+   "version": "3.11.4"
   }
  },
  "nbformat": 4,
diff --git a/ch02/01_main-chapter-code/exercise-solutions.ipynb b/ch02/01_main-chapter-code/exercise-solutions.ipynb
index 350f3c0..bfaa1f8 100644
--- a/ch02/01_main-chapter-code/exercise-solutions.ipynb
+++ b/ch02/01_main-chapter-code/exercise-solutions.ipynb
@@ -248,7 +248,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 11,
    "id": "4d50af16-937b-49e0-8ffd-42d30cbb41c9",
    "metadata": {},
    "outputs": [],
@@ -260,12 +260,11 @@
     "\n",
     "class GPTDatasetV1(Dataset):\n",
     "    def __init__(self, txt, tokenizer, max_length, stride):\n",
-    "        self.tokenizer = tokenizer\n",
     "        self.input_ids = []\n",
     "        self.target_ids = []\n",
     "\n",
     "        # Tokenize the entire text\n",
-    "        token_ids = self.tokenizer.encode(txt)\n",
+    "        token_ids = tokenizer.encode(txt)\n",
     "\n",
     "        # Use a sliding window to chunk the book into overlapping sequences of max_length\n",
     "        for i in range(0, len(token_ids) - max_length, stride):\n",
@@ -311,7 +310,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 12,
    "id": "0128eefa-d7c8-4f76-9851-566dfa7c3745",
    "metadata": {},
    "outputs": [
@@ -324,7 +323,7 @@
       "        [ 402,  271]])"
      ]
     },
-    "execution_count": 11,
+    "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
@@ -341,7 +340,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 13,
    "id": "ff5c1e90-c6de-4a87-adf6-7e19f603291c",
    "metadata": {},
    "outputs": [
@@ -354,7 +353,7 @@
       "        [  402,   271, 10899,  2138,   257,  7026, 15632,   438]])"
      ]
     },
-    "execution_count": 12,
+    "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
diff --git a/ch03/01_main-chapter-code/multihead-attention.ipynb b/ch03/01_main-chapter-code/multihead-attention.ipynb
index b788040..10a5422 100644
--- a/ch03/01_main-chapter-code/multihead-attention.ipynb
+++ b/ch03/01_main-chapter-code/multihead-attention.ipynb
@@ -82,12 +82,11 @@
     "\n",
     "class GPTDatasetV1(Dataset):\n",
     "    def __init__(self, txt, tokenizer, max_length, stride):\n",
-    "        self.tokenizer = tokenizer\n",
     "        self.input_ids = []\n",
     "        self.target_ids = []\n",
     "\n",
     "        # Tokenize the entire text\n",
-    "        token_ids = self.tokenizer.encode(txt, allowed_special={'<|endoftext|>'})\n",
+    "        token_ids = tokenizer.encode(txt, allowed_special={'<|endoftext|>'})\n",
     "\n",
     "        # Use a sliding window to chunk the book into overlapping sequences of max_length\n",
     "        for i in range(0, len(token_ids) - max_length, stride):\n",
diff --git a/ch04/01_main-chapter-code/previous_chapters.py b/ch04/01_main-chapter-code/previous_chapters.py
index 46d1e69..4a652bf 100644
--- a/ch04/01_main-chapter-code/previous_chapters.py
+++ b/ch04/01_main-chapter-code/previous_chapters.py
@@ -15,7 +15,7 @@ class GPTDatasetV1(Dataset):
         self.target_ids = []
 
         # Tokenize the entire text
-        token_ids = self.tokenizer.encode(txt)
+        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})
 
         # Use a sliding window to chunk the book into overlapping sequences of max_length
         for i in range(0, len(token_ids) - max_length, stride):
diff --git a/ch04/02_performance-analysis/previous_chapters.py b/ch04/02_performance-analysis/previous_chapters.py
index 9b05743..b1063ca 100644
--- a/ch04/02_performance-analysis/previous_chapters.py
+++ b/ch04/02_performance-analysis/previous_chapters.py
@@ -23,7 +23,7 @@ class GPTDatasetV1(Dataset):
         self.target_ids = []
 
         # Tokenize the entire text
-        token_ids = tokenizer.encode(txt)
+        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})
 
         # Use a sliding window to chunk the book into overlapping sequences of max_length
         for i in range(0, len(token_ids) - max_length, stride):
diff --git a/ch05/01_main-chapter-code/previous_chapters.py b/ch05/01_main-chapter-code/previous_chapters.py
index 9b05743..b1063ca 100644
--- a/ch05/01_main-chapter-code/previous_chapters.py
+++ b/ch05/01_main-chapter-code/previous_chapters.py
@@ -23,7 +23,7 @@ class GPTDatasetV1(Dataset):
         self.target_ids = []
 
         # Tokenize the entire text
-        token_ids = tokenizer.encode(txt)
+        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})
 
         # Use a sliding window to chunk the book into overlapping sequences of max_length
         for i in range(0, len(token_ids) - max_length, stride):
diff --git a/ch05/02_alternative_weight_loading/previous_chapters.py b/ch05/02_alternative_weight_loading/previous_chapters.py
index 0646b20..2b1c5f2 100644
--- a/ch05/02_alternative_weight_loading/previous_chapters.py
+++ b/ch05/02_alternative_weight_loading/previous_chapters.py
@@ -23,7 +23,7 @@ class GPTDatasetV1(Dataset):
         self.target_ids = []
 
         # Tokenize the entire text
-        token_ids = tokenizer.encode(txt)
+        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})
 
         # Use a sliding window to chunk the book into overlapping sequences of max_length
         for i in range(0, len(token_ids) - max_length, stride):
diff --git a/ch05/05_bonus_hparam_tuning/previous_chapters.py b/ch05/05_bonus_hparam_tuning/previous_chapters.py
index 1fa5502..67d6a4b 100644
--- a/ch05/05_bonus_hparam_tuning/previous_chapters.py
+++ b/ch05/05_bonus_hparam_tuning/previous_chapters.py
@@ -23,7 +23,7 @@ class GPTDatasetV1(Dataset):
         self.target_ids = []
 
         # Tokenize the entire text
-        token_ids = tokenizer.encode(txt)
+        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})
 
         # Use a sliding window to chunk the book into overlapping sequences of max_length
         for i in range(0, len(token_ids) - max_length, stride):
diff --git a/ch06/01_main-chapter-code/previous_chapters.py b/ch06/01_main-chapter-code/previous_chapters.py
index 4fc0f7e..feb6ab1 100644
--- a/ch06/01_main-chapter-code/previous_chapters.py
+++ b/ch06/01_main-chapter-code/previous_chapters.py
@@ -20,12 +20,11 @@ from torch.utils.data import Dataset, DataLoader
 
 class GPTDatasetV1(Dataset):
     def __init__(self, txt, tokenizer, max_length, stride):
-        self.tokenizer = tokenizer
         self.input_ids = []
         self.target_ids = []
 
         # Tokenize the entire text
-        token_ids = tokenizer.encode(txt)
+        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})
 
         # Use a sliding window to chunk the book into overlapping sequences of max_length
         for i in range(0, len(token_ids) - max_length, stride):
diff --git a/ch06/02_bonus_additional-experiments/previous_chapters.py b/ch06/02_bonus_additional-experiments/previous_chapters.py
index 66367c4..ead6a0f 100644
--- a/ch06/02_bonus_additional-experiments/previous_chapters.py
+++ b/ch06/02_bonus_additional-experiments/previous_chapters.py
@@ -20,12 +20,11 @@ from torch.utils.data import Dataset, DataLoader
 
 class GPTDatasetV1(Dataset):
     def __init__(self, txt, tokenizer, max_length, stride):
-        self.tokenizer = tokenizer
         self.input_ids = []
         self.target_ids = []
 
         # Tokenize the entire text
-        token_ids = tokenizer.encode(txt)
+        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})
 
         # Use a sliding window to chunk the book into overlapping sequences of max_length
         for i in range(0, len(token_ids) - max_length, stride):
diff --git a/ch06/03_bonus_imdb-classification/previous_chapters.py b/ch06/03_bonus_imdb-classification/previous_chapters.py
index 4fc0f7e..884eb5c 100644
--- a/ch06/03_bonus_imdb-classification/previous_chapters.py
+++ b/ch06/03_bonus_imdb-classification/previous_chapters.py
@@ -25,7 +25,7 @@ class GPTDatasetV1(Dataset):
         self.target_ids = []
 
         # Tokenize the entire text
-        token_ids = tokenizer.encode(txt)
+        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})
 
         # Use a sliding window to chunk the book into overlapping sequences of max_length
         for i in range(0, len(token_ids) - max_length, stride):
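Note (not part of the patch above): the recurring change in the previous_chapters.py files passes allowed_special={"<|endoftext|>"} to tokenizer.encode. The snippet below is a minimal sketch of why that argument matters, assuming tiktoken's "gpt2" encoding (the tokenizer used in these chapters) and a made-up sample string:

    import tiktoken

    # Sketch only: illustrates tiktoken's handling of special tokens,
    # not code from the patched files themselves.
    tokenizer = tiktoken.get_encoding("gpt2")
    text = "Hello, do you like tea? <|endoftext|> In the sunlit terraces"

    # By default, tiktoken treats special tokens in the input as disallowed
    # and raises a ValueError when it encounters <|endoftext|>.
    try:
        tokenizer.encode(text)
    except ValueError as err:
        print("encode() without allowed_special failed:", err)

    # Explicitly allowing the special token encodes it to its ID (50256 for GPT-2).
    token_ids = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
    print(token_ids)

This is the behavior the patch accounts for: training texts that contain <|endoftext|> separators tokenize without error once the special token is explicitly allowed.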