Mirror of https://github.com/rasbt/LLMs-from-scratch.git, synced 2025-10-28 00:09:11 +00:00
Use instance tokenizer (#116)
* Use instance tokenizer
* consistency updates

Co-authored-by: Sebastian Raschka <mail@sebastianraschka.com>
parent 94f6582cff
commit 0b866c133f
@@ -25,7 +25,7 @@ class GPTDatasetV1(Dataset):
         self.target_ids = []
 
         # Tokenize the entire text
-        token_ids = tokenizer.encode(txt)
+        token_ids = self.tokenizer.encode(txt)
 
         # Use a sliding window to chunk the book into overlapping sequences of max_length
         for i in range(0, len(token_ids) - max_length, stride):

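This first hunk, like the ones that follow, swaps direct calls on the `tokenizer` argument for calls through an instance attribute. A minimal sketch of what `GPTDatasetV1` looks like after the change, assuming the constructor also assigns `self.tokenizer = tokenizer` (that assignment sits outside the seven-line hunks shown on this page):

import torch
from torch.utils.data import Dataset


class GPTDatasetV1(Dataset):
    def __init__(self, txt, tokenizer, max_length, stride):
        self.tokenizer = tokenizer  # assumed: keep the tokenizer as an instance attribute
        self.input_ids = []
        self.target_ids = []

        # Tokenize the entire text via the instance attribute
        token_ids = self.tokenizer.encode(txt)

        # Use a sliding window to chunk the book into overlapping sequences of max_length
        for i in range(0, len(token_ids) - max_length, stride):
            input_chunk = token_ids[i:i + max_length]
            target_chunk = token_ids[i + 1:i + max_length + 1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.target_ids[idx]
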
@@ -1273,7 +1273,7 @@
     "        self.target_ids = []\n",
     "\n",
     "        # Tokenize the entire text\n",
-    "        token_ids = tokenizer.encode(txt, allowed_special={'<|endoftext|>'})\n",
+    "        token_ids = self.tokenizer.encode(txt, allowed_special={'<|endoftext|>'})\n",
     "\n",
     "        # Use a sliding window to chunk the book into overlapping sequences of max_length\n",
     "        for i in range(0, len(token_ids) - max_length, stride):\n",

@@ -48,7 +48,7 @@
     "        self.target_ids = []\n",
     "\n",
     "        # Tokenize the entire text\n",
-    "        token_ids = tokenizer.encode(txt, allowed_special={'<|endoftext|>'})\n",
+    "        token_ids = self.tokenizer.encode(txt, allowed_special={'<|endoftext|>'})\n",
     "\n",
     "        # Use a sliding window to chunk the book into overlapping sequences of max_length\n",
     "        for i in range(0, len(token_ids) - max_length, stride):\n",

@@ -150,7 +150,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.6"
+   "version": "3.10.10"
   }
  },
  "nbformat": 4,

@@ -256,7 +256,7 @@
     "        self.target_ids = []\n",
     "\n",
     "        # Tokenize the entire text\n",
-    "        token_ids = tokenizer.encode(txt)\n",
+    "        token_ids = self.tokenizer.encode(txt)\n",
     "\n",
     "        # Use a sliding window to chunk the book into overlapping sequences of max_length\n",
     "        for i in range(0, len(token_ids) - max_length, stride):\n",

@@ -377,7 +377,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.6"
+   "version": "3.10.10"
   }
  },
  "nbformat": 4,

@@ -78,7 +78,7 @@
     "        self.target_ids = []\n",
     "\n",
     "        # Tokenize the entire text\n",
-    "        token_ids = tokenizer.encode(txt, allowed_special={'<|endoftext|>'})\n",
+    "        token_ids = self.tokenizer.encode(txt, allowed_special={'<|endoftext|>'})\n",
     "\n",
     "        # Use a sliding window to chunk the book into overlapping sequences of max_length\n",
     "        for i in range(0, len(token_ids) - max_length, stride):\n",

@@ -374,7 +374,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.6"
+   "version": "3.10.10"
   }
  },
  "nbformat": 4,

@@ -19,7 +19,7 @@ class GPTDatasetV1(Dataset):
         self.target_ids = []
 
         # Tokenize the entire text
-        token_ids = tokenizer.encode(txt)
+        token_ids = self.tokenizer.encode(txt)
 
         # Use a sliding window to chunk the book into overlapping sequences of max_length
         for i in range(0, len(token_ids) - max_length, stride):

@@ -16,7 +16,7 @@ class GPTDatasetV1(Dataset):
         self.target_ids = []
 
         # Tokenize the entire text
-        token_ids = tokenizer.encode(txt)
+        token_ids = self.tokenizer.encode(txt)
 
         # Use a sliding window to chunk the book into overlapping sequences of max_length
         for i in range(0, len(token_ids) - max_length, stride):

@@ -19,7 +19,7 @@ class GPTDatasetV1(Dataset):
         self.target_ids = []
 
         # Tokenize the entire text
-        token_ids = tokenizer.encode(txt)
+        token_ids = self.tokenizer.encode(txt)
 
         # Use a sliding window to chunk the book into overlapping sequences of max_length
         for i in range(0, len(token_ids) - max_length, stride):

@@ -19,7 +19,7 @@ class GPTDatasetV1(Dataset):
         self.target_ids = []
 
         # Tokenize the entire text
-        token_ids = tokenizer.encode(txt)
+        token_ids = self.tokenizer.encode(txt)
 
         # Use a sliding window to chunk the book into overlapping sequences of max_length
         for i in range(0, len(token_ids) - max_length, stride):

@@ -25,7 +25,7 @@ class GPTDatasetV1(Dataset):
         self.input_ids = []
         self.target_ids = []
 
-        token_ids = tokenizer.encode(txt, allowed_special={'<|endoftext|>'})
+        token_ids = self.tokenizer.encode(txt, allowed_special={'<|endoftext|>'})
 
         for i in range(0, len(token_ids) - max_length, stride):
             input_chunk = token_ids[i:i + max_length]

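As a side note, the sliding window visible in these hunks produces overlapping input/target pairs shifted by one token. A tiny illustrative run with made-up values:

token_ids = list(range(10))  # stand-in for 10 real token IDs
max_length, stride = 4, 2

for i in range(0, len(token_ids) - max_length, stride):     # i = 0, 2, 4
    input_chunk = token_ids[i:i + max_length]               # [0,1,2,3], [2,3,4,5], [4,5,6,7]
    target_chunk = token_ids[i + 1:i + max_length + 1]      # each target is the input shifted by one
    print(input_chunk, target_chunk)
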
@@ -24,7 +24,7 @@ class GPTDatasetV1(Dataset):
         self.target_ids = []
 
         # Tokenize the entire text
-        token_ids = tokenizer.encode(txt)
+        token_ids = self.tokenizer.encode(txt)
 
         # Use a sliding window to chunk the book into overlapping sequences of max_length
         for i in range(0, len(token_ids) - max_length, stride):

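For completeness, a hedged usage sketch: tiktoken's GPT-2 encoding stands in for the tokenizer argument, and the sample file name is illustrative, not taken from this commit.

import tiktoken
from torch.utils.data import DataLoader

tokenizer = tiktoken.get_encoding("gpt2")
with open("the-verdict.txt", "r", encoding="utf-8") as f:  # hypothetical sample text file
    raw_text = f.read()

dataset = GPTDatasetV1(raw_text, tokenizer, max_length=256, stride=128)
loader = DataLoader(dataset, batch_size=4, shuffle=True, drop_last=True)

inputs, targets = next(iter(loader))
print(inputs.shape, targets.shape)  # two tensors of shape [4, 256]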