mirror of
https://github.com/rasbt/LLMs-from-scratch.git
synced 2025-11-01 10:20:00 +00:00
Clarify dataset length in chapter 2 (#589)
This commit is contained in:
parent
4e3b752e5e
commit
0bdcce4e40
@ -1296,6 +1296,7 @@
|
||||
"\n",
|
||||
" # Tokenize the entire text\n",
|
||||
" token_ids = tokenizer.encode(txt, allowed_special={\"<|endoftext|>\"})\n",
|
||||
" assert len(token_ids) > max_length, \"Number of tokenized inputs must at least be equal to max_length+1\"\n",
|
||||
"\n",
|
||||
" # Use a sliding window to chunk the book into overlapping sequences of max_length\n",
|
||||
" for i in range(0, len(token_ids) - max_length, stride):\n",
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user