Mirror of https://github.com/rasbt/LLMs-from-scratch.git, synced 2025-10-29 17:01:30 +00:00
Remove leftover instances of self.tokenizer (#201)
* Remove leftover instances of self.tokenizer
* Add endoftext token
This commit is contained in:
parent 98d23751f7
commit 40ba3a4068
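In short, the commit makes two small changes to GPTDatasetV1: the unused self.tokenizer attribute is dropped (the tokenizer is only needed while __init__ runs, so nothing else on the instance ever reads it), and encode() is called with allowed_special={"<|endoftext|>"} so texts containing the end-of-text marker can be tokenized. A minimal sketch of the class as it stands after the change follows; the sliding-window loop body, __len__, __getitem__, and the usage line at the bottom are not part of this diff and are filled in only as an illustrative assumption.

import tiktoken  # assumed: the surrounding examples use tiktoken's GPT-2 BPE encoding
import torch
from torch.utils.data import Dataset, DataLoader


class GPTDatasetV1(Dataset):
    def __init__(self, txt, tokenizer, max_length, stride):
        self.input_ids = []
        self.target_ids = []

        # Tokenize the entire text; the special end-of-text token is now allowed
        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

        # Use a sliding window to chunk the book into overlapping sequences of max_length
        for i in range(0, len(token_ids) - max_length, stride):
            # Assumed loop body (not shown in this diff): inputs, plus targets shifted by one token
            self.input_ids.append(torch.tensor(token_ids[i:i + max_length]))
            self.target_ids.append(torch.tensor(token_ids[i + 1:i + max_length + 1]))

    # Not touched by this commit; included only so the sketch is usable with a DataLoader
    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.target_ids[idx]


# Hypothetical usage with made-up hyperparameters
tokenizer = tiktoken.get_encoding("gpt2")
dataset = GPTDatasetV1("Hello, world. <|endoftext|> In the sunlit terraces.", tokenizer,
                       max_length=4, stride=4)
loader = DataLoader(dataset, batch_size=2, shuffle=False)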
@@ -24,7 +24,7 @@ class GPTDatasetV1(Dataset):
         self.target_ids = []
 
         # Tokenize the entire text
-        token_ids = tokenizer.encode(txt)
+        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})
 
         # Use a sliding window to chunk the book into overlapping sequences of max_length
         for i in range(0, len(token_ids) - max_length, stride):
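The allowed_special argument in the hunk above matters because tiktoken refuses to encode special tokens by default: calling encode() on text that contains <|endoftext|> raises an error unless that token is explicitly allowed. A minimal illustration, assuming the GPT-2 BPE encoding:

import tiktoken

tokenizer = tiktoken.get_encoding("gpt2")
text = "Hello, do you like tea? <|endoftext|> In the sunlit terraces"

# tokenizer.encode(text) would raise an error here, because special tokens are
# disallowed by default; allowing the marker encodes it as a single id (50256).
ids = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
print(ids)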
@@ -28,12 +28,11 @@ from torch.utils.data import Dataset, DataLoader
 
 class GPTDatasetV1(Dataset):
     def __init__(self, txt, tokenizer, max_length, stride):
-        self.tokenizer = tokenizer
         self.input_ids = []
         self.target_ids = []
 
         # Tokenize the entire text
-        token_ids = tokenizer.encode(txt)
+        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})
 
         # Use a sliding window to chunk the book into overlapping sequences of max_length
         for i in range(0, len(token_ids) - max_length, stride):
@@ -1920,7 +1920,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.6"
+   "version": "3.11.4"
   }
  },
  "nbformat": 4,
@@ -248,7 +248,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 11,
    "id": "4d50af16-937b-49e0-8ffd-42d30cbb41c9",
    "metadata": {},
    "outputs": [],
@@ -260,12 +260,11 @@
    "\n",
    "class GPTDatasetV1(Dataset):\n",
    "    def __init__(self, txt, tokenizer, max_length, stride):\n",
-   "        self.tokenizer = tokenizer\n",
    "        self.input_ids = []\n",
    "        self.target_ids = []\n",
    "\n",
    "        # Tokenize the entire text\n",
-   "        token_ids = self.tokenizer.encode(txt)\n",
+   "        token_ids = tokenizer.encode(txt)\n",
    "\n",
    "        # Use a sliding window to chunk the book into overlapping sequences of max_length\n",
    "        for i in range(0, len(token_ids) - max_length, stride):\n",
@@ -311,7 +310,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 12,
    "id": "0128eefa-d7c8-4f76-9851-566dfa7c3745",
    "metadata": {},
    "outputs": [
@@ -324,7 +323,7 @@
        " [ 402, 271]])"
       ]
      },
-     "execution_count": 11,
+     "execution_count": 12,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -341,7 +340,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 13,
    "id": "ff5c1e90-c6de-4a87-adf6-7e19f603291c",
    "metadata": {},
    "outputs": [
@@ -354,7 +353,7 @@
        " [ 402, 271, 10899, 2138, 257, 7026, 15632, 438]])"
       ]
      },
-     "execution_count": 12,
+     "execution_count": 13,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -82,12 +82,11 @@
    "\n",
    "class GPTDatasetV1(Dataset):\n",
    "    def __init__(self, txt, tokenizer, max_length, stride):\n",
-   "        self.tokenizer = tokenizer\n",
    "        self.input_ids = []\n",
    "        self.target_ids = []\n",
    "\n",
    "        # Tokenize the entire text\n",
-   "        token_ids = self.tokenizer.encode(txt, allowed_special={'<|endoftext|>'})\n",
+   "        token_ids = tokenizer.encode(txt, allowed_special={'<|endoftext|>'})\n",
    "\n",
    "        # Use a sliding window to chunk the book into overlapping sequences of max_length\n",
    "        for i in range(0, len(token_ids) - max_length, stride):\n",
@@ -15,7 +15,7 @@ class GPTDatasetV1(Dataset):
         self.target_ids = []
 
         # Tokenize the entire text
-        token_ids = self.tokenizer.encode(txt)
+        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})
 
         # Use a sliding window to chunk the book into overlapping sequences of max_length
         for i in range(0, len(token_ids) - max_length, stride):
@@ -23,7 +23,7 @@ class GPTDatasetV1(Dataset):
         self.target_ids = []
 
         # Tokenize the entire text
-        token_ids = tokenizer.encode(txt)
+        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})
 
         # Use a sliding window to chunk the book into overlapping sequences of max_length
         for i in range(0, len(token_ids) - max_length, stride):
@@ -23,7 +23,7 @@ class GPTDatasetV1(Dataset):
         self.target_ids = []
 
         # Tokenize the entire text
-        token_ids = tokenizer.encode(txt)
+        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})
 
         # Use a sliding window to chunk the book into overlapping sequences of max_length
         for i in range(0, len(token_ids) - max_length, stride):
@@ -23,7 +23,7 @@ class GPTDatasetV1(Dataset):
         self.target_ids = []
 
         # Tokenize the entire text
-        token_ids = tokenizer.encode(txt)
+        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})
 
         # Use a sliding window to chunk the book into overlapping sequences of max_length
         for i in range(0, len(token_ids) - max_length, stride):
@@ -23,7 +23,7 @@ class GPTDatasetV1(Dataset):
         self.target_ids = []
 
         # Tokenize the entire text
-        token_ids = tokenizer.encode(txt)
+        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})
 
         # Use a sliding window to chunk the book into overlapping sequences of max_length
         for i in range(0, len(token_ids) - max_length, stride):
@@ -20,12 +20,11 @@ from torch.utils.data import Dataset, DataLoader
 
 class GPTDatasetV1(Dataset):
     def __init__(self, txt, tokenizer, max_length, stride):
-        self.tokenizer = tokenizer
         self.input_ids = []
         self.target_ids = []
 
         # Tokenize the entire text
-        token_ids = tokenizer.encode(txt)
+        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})
 
         # Use a sliding window to chunk the book into overlapping sequences of max_length
         for i in range(0, len(token_ids) - max_length, stride):
@@ -20,12 +20,11 @@ from torch.utils.data import Dataset, DataLoader
 
 class GPTDatasetV1(Dataset):
     def __init__(self, txt, tokenizer, max_length, stride):
-        self.tokenizer = tokenizer
         self.input_ids = []
         self.target_ids = []
 
         # Tokenize the entire text
-        token_ids = tokenizer.encode(txt)
+        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})
 
         # Use a sliding window to chunk the book into overlapping sequences of max_length
         for i in range(0, len(token_ids) - max_length, stride):
@@ -25,7 +25,7 @@ class GPTDatasetV1(Dataset):
         self.target_ids = []
 
         # Tokenize the entire text
-        token_ids = tokenizer.encode(txt)
+        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})
 
         # Use a sliding window to chunk the book into overlapping sequences of max_length
         for i in range(0, len(token_ids) - max_length, stride):