From e1adeb14f3d5b01b85fcd770334ec4060fe883c5 Mon Sep 17 00:00:00 2001
From: rasbt
Date: Sun, 9 Jun 2024 06:04:02 -0500
Subject: [PATCH] add allowed_special={"<|endoftext|>"}

---
 ch02/01_main-chapter-code/exercise-solutions.ipynb | 4 ++--
 ch04/01_main-chapter-code/gpt.py                   | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/ch02/01_main-chapter-code/exercise-solutions.ipynb b/ch02/01_main-chapter-code/exercise-solutions.ipynb
index bfaa1f8..d7dc38e 100644
--- a/ch02/01_main-chapter-code/exercise-solutions.ipynb
+++ b/ch02/01_main-chapter-code/exercise-solutions.ipynb
@@ -264,7 +264,7 @@
     "        self.target_ids = []\n",
     "\n",
     "        # Tokenize the entire text\n",
-    "        token_ids = tokenizer.encode(txt)\n",
+    "        token_ids = tokenizer.encode(txt, allowed_special={\"<|endoftext|>\"})\n",
     "\n",
     "        # Use a sliding window to chunk the book into overlapping sequences of max_length\n",
     "        for i in range(0, len(token_ids) - max_length, stride):\n",
@@ -385,7 +385,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.11.4"
+   "version": "3.10.6"
   }
  },
  "nbformat": 4,
diff --git a/ch04/01_main-chapter-code/gpt.py b/ch04/01_main-chapter-code/gpt.py
index b2d985b..5066012 100644
--- a/ch04/01_main-chapter-code/gpt.py
+++ b/ch04/01_main-chapter-code/gpt.py
@@ -18,7 +18,7 @@ class GPTDatasetV1(Dataset):
         self.target_ids = []
 
         # Tokenize the entire text
-        token_ids = tokenizer.encode(txt)
+        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})
 
         # Use a sliding window to chunk the book into overlapping sequences of max_length
         for i in range(0, len(token_ids) - max_length, stride):
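
Note: the tokenizer in GPTDatasetV1 is tiktoken's GPT-2 BPE encoding, as used elsewhere in the chapter code. The following standalone sketch (assuming tiktoken is installed; the sample text is illustrative only) shows the behavior the added allowed_special argument addresses: by default tiktoken rejects input containing the <|endoftext|> special token, and allowing it lets the token be encoded to its reserved ID.

import tiktoken

tokenizer = tiktoken.get_encoding("gpt2")
text = "Hello, world. <|endoftext|> In the sunlit terraces"

# By default, encode() treats special tokens found in the input as
# disallowed and raises a ValueError.
try:
    tokenizer.encode(text)
except ValueError as err:
    print("encode() without allowed_special raised:", err)

# Passing allowed_special lets <|endoftext|> be encoded to its reserved
# token ID (50256 for the GPT-2 encoding).
token_ids = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
print(token_ids)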