Use instance tokenizer (#116)

* Use instance tokenizer

* consistency updates

---------

Co-authored-by: Sebastian Raschka <mail@sebastianraschka.com>
James Holcombe 2024-04-10 21:16:19 -04:00 committed by GitHub
parent 94f6582cff
commit 0b866c133f
11 changed files with 14 additions and 14 deletions

View File

@@ -25,7 +25,7 @@ class GPTDatasetV1(Dataset):
self.target_ids = []
# Tokenize the entire text
- token_ids = tokenizer.encode(txt)
+ token_ids = self.tokenizer.encode(txt)
# Use a sliding window to chunk the book into overlapping sequences of max_length
for i in range(0, len(token_ids) - max_length, stride):
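
The hunk above only shows the call site; the rest of the class sits outside the diff context. A minimal sketch of how GPTDatasetV1 could read after this change, assuming the usual PyTorch Dataset boilerplate and a shifted-target sliding window (only the self.tokenizer lines are taken from the diff itself):

import torch
from torch.utils.data import Dataset

class GPTDatasetV1(Dataset):
    def __init__(self, txt, tokenizer, max_length, stride):
        self.tokenizer = tokenizer  # keep the tokenizer on the instance
        self.input_ids = []
        self.target_ids = []

        # Tokenize the entire text via the instance attribute
        token_ids = self.tokenizer.encode(txt)

        # Use a sliding window to chunk the book into overlapping sequences of max_length
        for i in range(0, len(token_ids) - max_length, stride):
            input_chunk = token_ids[i:i + max_length]
            target_chunk = token_ids[i + 1:i + max_length + 1]  # assumed next-token targets
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.target_ids[idx]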

View File

@@ -1273,7 +1273,7 @@
" self.target_ids = []\n",
"\n",
" # Tokenize the entire text\n",
- " token_ids = tokenizer.encode(txt, allowed_special={'<|endoftext|>'})\n",
+ " token_ids = self.tokenizer.encode(txt, allowed_special={'<|endoftext|>'})\n",
"\n",
" # Use a sliding window to chunk the book into overlapping sequences of max_length\n",
" for i in range(0, len(token_ids) - max_length, stride):\n",

View File

@@ -48,7 +48,7 @@
" self.target_ids = []\n",
"\n",
" # Tokenize the entire text\n",
- " token_ids = tokenizer.encode(txt, allowed_special={'<|endoftext|>'})\n",
+ " token_ids = self.tokenizer.encode(txt, allowed_special={'<|endoftext|>'})\n",
"\n",
" # Use a sliding window to chunk the book into overlapping sequences of max_length\n",
" for i in range(0, len(token_ids) - max_length, stride):\n",
@@ -150,7 +150,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.10.6"
+ "version": "3.10.10"
}
},
"nbformat": 4,

View File

@@ -256,7 +256,7 @@
" self.target_ids = []\n",
"\n",
" # Tokenize the entire text\n",
- " token_ids = tokenizer.encode(txt)\n",
+ " token_ids = self.tokenizer.encode(txt)\n",
"\n",
" # Use a sliding window to chunk the book into overlapping sequences of max_length\n",
" for i in range(0, len(token_ids) - max_length, stride):\n",
@@ -377,7 +377,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.10.6"
+ "version": "3.10.10"
}
},
"nbformat": 4,

View File

@@ -78,7 +78,7 @@
" self.target_ids = []\n",
"\n",
" # Tokenize the entire text\n",
- " token_ids = tokenizer.encode(txt, allowed_special={'<|endoftext|>'})\n",
+ " token_ids = self.tokenizer.encode(txt, allowed_special={'<|endoftext|>'})\n",
"\n",
" # Use a sliding window to chunk the book into overlapping sequences of max_length\n",
" for i in range(0, len(token_ids) - max_length, stride):\n",
@@ -374,7 +374,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.10.6"
+ "version": "3.10.10"
}
},
"nbformat": 4,

View File

@@ -19,7 +19,7 @@ class GPTDatasetV1(Dataset):
self.target_ids = []
# Tokenize the entire text
- token_ids = tokenizer.encode(txt)
+ token_ids = self.tokenizer.encode(txt)
# Use a sliding window to chunk the book into overlapping sequences of max_length
for i in range(0, len(token_ids) - max_length, stride):

View File

@@ -16,7 +16,7 @@ class GPTDatasetV1(Dataset):
self.target_ids = []
# Tokenize the entire text
- token_ids = tokenizer.encode(txt)
+ token_ids = self.tokenizer.encode(txt)
# Use a sliding window to chunk the book into overlapping sequences of max_length
for i in range(0, len(token_ids) - max_length, stride):

View File

@@ -19,7 +19,7 @@ class GPTDatasetV1(Dataset):
self.target_ids = []
# Tokenize the entire text
- token_ids = tokenizer.encode(txt)
+ token_ids = self.tokenizer.encode(txt)
# Use a sliding window to chunk the book into overlapping sequences of max_length
for i in range(0, len(token_ids) - max_length, stride):

View File

@@ -19,7 +19,7 @@ class GPTDatasetV1(Dataset):
self.target_ids = []
# Tokenize the entire text
- token_ids = tokenizer.encode(txt)
+ token_ids = self.tokenizer.encode(txt)
# Use a sliding window to chunk the book into overlapping sequences of max_length
for i in range(0, len(token_ids) - max_length, stride):

View File

@@ -25,7 +25,7 @@ class GPTDatasetV1(Dataset):
self.input_ids = []
self.target_ids = []
- token_ids = tokenizer.encode(txt, allowed_special={'<|endoftext|>'})
+ token_ids = self.tokenizer.encode(txt, allowed_special={'<|endoftext|>'})
for i in range(0, len(token_ids) - max_length, stride):
input_chunk = token_ids[i:i + max_length]
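
Several of the hunks pass allowed_special to encode. A small illustration of what that flag does, assuming a tiktoken GPT-2 tokenizer (the tokenizer actually passed to the class is not shown in this diff):

import tiktoken

tok = tiktoken.get_encoding("gpt2")

# tiktoken raises an error if the text contains a special token that is not explicitly allowed;
# listing it in allowed_special maps it to its reserved id instead.
ids = tok.encode("Hello.<|endoftext|>Goodbye.", allowed_special={"<|endoftext|>"})
print(ids)  # the special token becomes a single id (50256 in the gpt2 encoding)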

View File

@@ -24,7 +24,7 @@ class GPTDatasetV1(Dataset):
self.target_ids = []
# Tokenize the entire text
- token_ids = tokenizer.encode(txt)
+ token_ids = self.tokenizer.encode(txt)
# Use a sliding window to chunk the book into overlapping sequences of max_length
for i in range(0, len(token_ids) - max_length, stride):
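
The change is internal to the class, so construction stays the same. A hypothetical usage sketch, assuming the GPTDatasetV1 sketch above, a tiktoken GPT-2 tokenizer, and a placeholder text file (none of these names appear in this diff):

import tiktoken
from torch.utils.data import DataLoader

tokenizer = tiktoken.get_encoding("gpt2")

# Placeholder input file; any raw training text works here.
with open("some-text-file.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

dataset = GPTDatasetV1(raw_text, tokenizer, max_length=256, stride=128)
dataloader = DataLoader(dataset, batch_size=4, shuffle=True, drop_last=True)

for input_batch, target_batch in dataloader:
    print(input_batch.shape, target_batch.shape)  # both torch.Size([4, 256])
    break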