mirror of
https://github.com/rasbt/LLMs-from-scratch.git
synced 2025-10-27 07:49:25 +00:00
Add allowed_special={"<|endoftext|>"} to the tokenizer.encode(txt) calls in the notebook and in GPTDatasetV1
This commit is contained in:
parent
40ba3a4068
commit
e1adeb14f3
@ -264,7 +264,7 @@
|
|||||||
" self.target_ids = []\n",
|
" self.target_ids = []\n",
|
||||||
"\n",
|
"\n",
|
||||||
" # Tokenize the entire text\n",
|
" # Tokenize the entire text\n",
|
||||||
" token_ids = tokenizer.encode(txt)\n",
|
" token_ids = tokenizer.encode(txt, allowed_special={\"<|endoftext|>\"})\n",
|
||||||
"\n",
|
"\n",
|
||||||
" # Use a sliding window to chunk the book into overlapping sequences of max_length\n",
|
" # Use a sliding window to chunk the book into overlapping sequences of max_length\n",
|
||||||
" for i in range(0, len(token_ids) - max_length, stride):\n",
|
" for i in range(0, len(token_ids) - max_length, stride):\n",
|
||||||
@ -385,7 +385,7 @@
|
|||||||
"name": "python",
|
"name": "python",
|
||||||
"nbconvert_exporter": "python",
|
"nbconvert_exporter": "python",
|
||||||
"pygments_lexer": "ipython3",
|
"pygments_lexer": "ipython3",
|
||||||
"version": "3.11.4"
|
"version": "3.10.6"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"nbformat": 4,
|
"nbformat": 4,
|
||||||
|
|||||||
@ -18,7 +18,7 @@ class GPTDatasetV1(Dataset):
|
|||||||
self.target_ids = []
|
self.target_ids = []
|
||||||
|
|
||||||
# Tokenize the entire text
|
# Tokenize the entire text
|
||||||
token_ids = tokenizer.encode(txt)
|
token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})
|
||||||
|
|
||||||
# Use a sliding window to chunk the book into overlapping sequences of max_length
|
# Use a sliding window to chunk the book into overlapping sequences of max_length
|
||||||
for i in range(0, len(token_ids) - max_length, stride):
|
for i in range(0, len(token_ids) - max_length, stride):
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user