From 0bdcce4e40d1bae0ad29be27baccecda4b7bdfab Mon Sep 17 00:00:00 2001
From: Sebastian Raschka
Date: Sun, 30 Mar 2025 16:01:37 -0500
Subject: [PATCH] Clarify dataset length in chapter 2 (#589)

---
 ch02/01_main-chapter-code/ch02.ipynb | 1 +
 1 file changed, 1 insertion(+)

diff --git a/ch02/01_main-chapter-code/ch02.ipynb b/ch02/01_main-chapter-code/ch02.ipynb
index 06b55ae..938795c 100644
--- a/ch02/01_main-chapter-code/ch02.ipynb
+++ b/ch02/01_main-chapter-code/ch02.ipynb
@@ -1296,6 +1296,7 @@
     "\n",
     "        # Tokenize the entire text\n",
     "        token_ids = tokenizer.encode(txt, allowed_special={\"<|endoftext|>\"})\n",
+    "        assert len(token_ids) > max_length, \"Number of tokenized inputs must be at least max_length+1\"\n",
     "\n",
     "        # Use a sliding window to chunk the book into overlapping sequences of max_length\n",
     "        for i in range(0, len(token_ids) - max_length, stride):\n",
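
Note (reviewer context, not part of the patch): the added guard matters because of the sliding-window loop visible in the hunk. When len(token_ids) <= max_length, range(0, len(token_ids) - max_length, stride) is empty, so the dataset is silently built with zero samples and the failure only surfaces later, at DataLoader time. The assert turns that into an immediate, explicit error. Below is a minimal, runnable sketch of the patched class; the __init__ signature, the encode() call, and the loop header come from the diff's context lines, while the chunk-building lines, __len__, __getitem__, and the usage example are assumptions added only to make the sketch self-contained.

import tiktoken
import torch
from torch.utils.data import Dataset


class GPTDatasetV1(Dataset):
    def __init__(self, txt, tokenizer, max_length, stride):
        self.input_ids = []
        self.target_ids = []

        # Tokenize the entire text (from the diff's context lines)
        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})
        # The added guard: fail fast instead of producing an empty dataset
        assert len(token_ids) > max_length, \
            "Number of tokenized inputs must be at least max_length+1"

        # Use a sliding window to chunk the book into overlapping
        # sequences of max_length (loop header from the diff; chunk
        # construction below is assumed from the surrounding notebook)
        for i in range(0, len(token_ids) - max_length, stride):
            input_chunk = token_ids[i:i + max_length]
            target_chunk = token_ids[i + 1:i + max_length + 1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.target_ids[idx]


# Example: a text that tokenizes to fewer than max_length+1 tokens now
# raises immediately rather than yielding a 0-sample dataset.
tokenizer = tiktoken.get_encoding("gpt2")
try:
    GPTDatasetV1("Hello, world.", tokenizer, max_length=256, stride=128)
except AssertionError as e:
    print(e)  # Number of tokenized inputs must be at least max_length+1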