add allowed_special={"<|endoftext|>"}

2025-11-03 11:20:49 +00:00 · 2024-06-09 06:04:02 -05:00 · 2024-06-09 06:04:02 -05:00 · e1adeb14f3
commit e1adeb14f3
parent 40ba3a4068
2 changed files with 3 additions and 3 deletions
--- a/ch02/01_main-chapter-code/exercise-solutions.ipynb
+++ b/ch02/01_main-chapter-code/exercise-solutions.ipynb
@@ -264,7 +264,7 @@
    "        self.target_ids = []\n",
    "\n",
    "        # Tokenize the entire text\n",
-    "        token_ids = tokenizer.encode(txt)\n",
+    "        token_ids = tokenizer.encode(txt, allowed_special={\"<|endoftext|>\"})\n",
    "\n",
    "        # Use a sliding window to chunk the book into overlapping sequences of max_length\n",
    "        for i in range(0, len(token_ids) - max_length, stride):\n",
@@ -385,7 +385,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.11.4"
+   "version": "3.10.6"
  }
 },
 "nbformat": 4,
--- a/ch04/01_main-chapter-code/gpt.py
+++ b/ch04/01_main-chapter-code/gpt.py
@@ -18,7 +18,7 @@ class GPTDatasetV1(Dataset):
        self.target_ids = []

        # Tokenize the entire text
-        token_ids = tokenizer.encode(txt)
+        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

        # Use a sliding window to chunk the book into overlapping sequences of max_length
        for i in range(0, len(token_ids) - max_length, stride):