add allowed_special={"<|endoftext|>"}

rasbt 2024-06-09 06:04:02 -05:00
parent 40ba3a4068
commit e1adeb14f3
2 changed files with 3 additions and 3 deletions


@@ -264,7 +264,7 @@
 "        self.target_ids = []\n",
 "\n",
 "        # Tokenize the entire text\n",
-"        token_ids = tokenizer.encode(txt)\n",
+"        token_ids = tokenizer.encode(txt, allowed_special={\"<|endoftext|>\"})\n",
 "\n",
 "        # Use a sliding window to chunk the book into overlapping sequences of max_length\n",
 "        for i in range(0, len(token_ids) - max_length, stride):\n",
@@ -385,7 +385,7 @@
 "name": "python",
 "nbconvert_exporter": "python",
 "pygments_lexer": "ipython3",
-"version": "3.11.4"
+"version": "3.10.6"
 }
 },
 "nbformat": 4,


@@ -18,7 +18,7 @@ class GPTDatasetV1(Dataset):
         self.target_ids = []

         # Tokenize the entire text
-        token_ids = tokenizer.encode(txt)
+        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

         # Use a sliding window to chunk the book into overlapping sequences of max_length
         for i in range(0, len(token_ids) - max_length, stride):
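
For context on why this change is needed: tiktoken's `encode` rejects input text that contains special-token strings by default (its `disallowed_special` parameter defaults to `"all"`), so calling `tokenizer.encode(txt)` on a corpus that embeds the `<|endoftext|>` marker raises a `ValueError`. Passing `allowed_special={"<|endoftext|>"}` lets the marker through as a single token. Below is a minimal sketch of that behavior; the sample string is illustrative, not from the repository:

```python
import tiktoken

tokenizer = tiktoken.get_encoding("gpt2")

# Illustrative sample: two documents joined by the end-of-text marker
txt = "First document. <|endoftext|> Second document."

# Default behavior: special tokens in the input are treated as an error.
# tokenizer.encode(txt)  # raises ValueError (disallowed special token)

# With allowed_special, <|endoftext|> is encoded as a single token (ID 50256):
token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})
print(50256 in token_ids)  # True
```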