Mirror of https://github.com/rasbt/LLMs-from-scratch.git (synced 2025-11-16 18:14:40 +00:00)
Clarify dataset length in chapter 2 (#589)
parent 4e3b752e5e
commit 0bdcce4e40
@@ -1296,6 +1296,7 @@
     "\n",
     "        # Tokenize the entire text\n",
     "        token_ids = tokenizer.encode(txt, allowed_special={\"<|endoftext|>\"})\n",
+    "        assert len(token_ids) > max_length, \"Number of tokenized inputs must at least be equal to max_length+1\"\n",
     "\n",
     "        # Use a sliding window to chunk the book into overlapping sequences of max_length\n",
     "        for i in range(0, len(token_ids) - max_length, stride):\n",
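For context, the added assert guards the top of the dataset constructor, right after tokenization. Below is a minimal, self-contained sketch of how the changed lines sit inside the chapter's GPTDatasetV1 class; everything outside the three lines visible in the diff above (the chunk-building loop body, __len__, __getitem__, and the tiktoken setup in the demo) is reconstructed for illustration and is not part of this commit.

import torch
import tiktoken
from torch.utils.data import Dataset


class GPTDatasetV1(Dataset):
    def __init__(self, txt, tokenizer, max_length, stride):
        self.input_ids = []
        self.target_ids = []

        # Tokenize the entire text
        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})
        # New in this commit: fail loudly if the text is too short to yield
        # even one (input, target) pair instead of silently building an empty dataset
        assert len(token_ids) > max_length, "Number of tokenized inputs must at least be equal to max_length+1"

        # Use a sliding window to chunk the book into overlapping sequences of max_length
        for i in range(0, len(token_ids) - max_length, stride):
            input_chunk = token_ids[i:i + max_length]
            target_chunk = token_ids[i + 1: i + max_length + 1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.target_ids[idx]


if __name__ == "__main__":
    tokenizer = tiktoken.get_encoding("gpt2")
    # A text this short tokenizes to far fewer than max_length + 1 tokens,
    # so the new assert fires with an explanatory message rather than
    # silently returning a dataset of length 0.
    try:
        GPTDatasetV1("Hello, world.", tokenizer, max_length=256, stride=128)
    except AssertionError as err:
        print(err)

Without the assert, the range in the sliding-window loop is empty whenever len(token_ids) <= max_length, so the dataset ends up with zero samples and the problem only surfaces later as a confusing DataLoader error; the assert turns that into an immediate, explanatory failure.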