mirror of https://github.com/rasbt/LLMs-from-scratch.git
synced 2025-09-03 13:28:18 +00:00

commit cdcd73ba7f
parent 6243726ab3

    drop_last=True
@@ -26,7 +26,7 @@
 "name": "stdout",
 "output_type": "stream",
 "text": [
-"torch version: 2.0.1\n",
+"torch version: 2.1.0\n",
 "tiktoken version: 0.5.1\n"
 ]
 }
@@ -559,7 +559,7 @@
 "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
 "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)",
 "Cell \u001b[0;32mIn[16], line 5\u001b[0m\n\u001b[1;32m 1\u001b[0m tokenizer \u001b[38;5;241m=\u001b[39m SimpleTokenizerV1(vocab)\n\u001b[1;32m 3\u001b[0m text \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mHello, do you like tea. Is this-- a test?\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m----> 5\u001b[0m \u001b[43mtokenizer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mencode\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtext\u001b[49m\u001b[43m)\u001b[49m\n",
-"Cell \u001b[0;32mIn[12], line 9\u001b[0m, in \u001b[0;36mSimpleTokenizerV1.encode\u001b[0;34m(self, text)\u001b[0m\n\u001b[1;32m 7\u001b[0m preprocessed \u001b[38;5;241m=\u001b[39m re\u001b[38;5;241m.\u001b[39msplit(\u001b[38;5;124mr\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m([,.?_!\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m()\u001b[39m\u001b[38;5;130;01m\\'\u001b[39;00m\u001b[38;5;124m]|--|\u001b[39m\u001b[38;5;124m\\\u001b[39m\u001b[38;5;124ms)\u001b[39m\u001b[38;5;124m'\u001b[39m, text)\n\u001b[1;32m 8\u001b[0m preprocessed \u001b[38;5;241m=\u001b[39m [item\u001b[38;5;241m.\u001b[39mstrip() \u001b[38;5;28;01mfor\u001b[39;00m item \u001b[38;5;129;01min\u001b[39;00m preprocessed \u001b[38;5;28;01mif\u001b[39;00m item\u001b[38;5;241m.\u001b[39mstrip()]\n\u001b[0;32m----> 9\u001b[0m ids \u001b[38;5;241m=\u001b[39m \u001b[43m[\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mstr_to_int\u001b[49m\u001b[43m[\u001b[49m\u001b[43ms\u001b[49m\u001b[43m]\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43ms\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mpreprocessed\u001b[49m\u001b[43m]\u001b[49m\n\u001b[1;32m 10\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m ids\n",
+"Cell \u001b[0;32mIn[12], line 9\u001b[0m, in \u001b[0;36mSimpleTokenizerV1.encode\u001b[0;34m(self, text)\u001b[0m\n\u001b[1;32m 7\u001b[0m preprocessed \u001b[38;5;241m=\u001b[39m re\u001b[38;5;241m.\u001b[39msplit(\u001b[38;5;124mr\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m([,.?_!\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m()\u001b[39m\u001b[38;5;130;01m\\'\u001b[39;00m\u001b[38;5;124m]|--|\u001b[39m\u001b[38;5;124m\\\u001b[39m\u001b[38;5;124ms)\u001b[39m\u001b[38;5;124m'\u001b[39m, text)\n\u001b[1;32m 8\u001b[0m preprocessed \u001b[38;5;241m=\u001b[39m [item\u001b[38;5;241m.\u001b[39mstrip() \u001b[38;5;28;01mfor\u001b[39;00m item \u001b[38;5;129;01min\u001b[39;00m preprocessed \u001b[38;5;28;01mif\u001b[39;00m item\u001b[38;5;241m.\u001b[39mstrip()]\n\u001b[0;32m----> 9\u001b[0m ids \u001b[38;5;241m=\u001b[39m [\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstr_to_int[s] \u001b[38;5;28;01mfor\u001b[39;00m s \u001b[38;5;129;01min\u001b[39;00m preprocessed]\n\u001b[1;32m 10\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m ids\n",
 "Cell \u001b[0;32mIn[12], line 9\u001b[0m, in \u001b[0;36m<listcomp>\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m 7\u001b[0m preprocessed \u001b[38;5;241m=\u001b[39m re\u001b[38;5;241m.\u001b[39msplit(\u001b[38;5;124mr\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m([,.?_!\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m()\u001b[39m\u001b[38;5;130;01m\\'\u001b[39;00m\u001b[38;5;124m]|--|\u001b[39m\u001b[38;5;124m\\\u001b[39m\u001b[38;5;124ms)\u001b[39m\u001b[38;5;124m'\u001b[39m, text)\n\u001b[1;32m 8\u001b[0m preprocessed \u001b[38;5;241m=\u001b[39m [item\u001b[38;5;241m.\u001b[39mstrip() \u001b[38;5;28;01mfor\u001b[39;00m item \u001b[38;5;129;01min\u001b[39;00m preprocessed \u001b[38;5;28;01mif\u001b[39;00m item\u001b[38;5;241m.\u001b[39mstrip()]\n\u001b[0;32m----> 9\u001b[0m ids \u001b[38;5;241m=\u001b[39m [\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mstr_to_int\u001b[49m\u001b[43m[\u001b[49m\u001b[43ms\u001b[49m\u001b[43m]\u001b[49m \u001b[38;5;28;01mfor\u001b[39;00m s \u001b[38;5;129;01min\u001b[39;00m preprocessed]\n\u001b[1;32m 10\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m ids\n",
 "\u001b[0;31mKeyError\u001b[0m: 'Hello'"
 ]
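
The hunk above changes only the ANSI color codes recorded in the saved notebook output (the cell was re-run under the newer environment); the KeyError it records is unchanged. A minimal sketch of that failure mode, using a hypothetical toy vocabulary in place of the one built from the training text:

    # SimpleTokenizerV1.encode looks every token up in self.str_to_int,
    # so any token missing from the vocabulary raises KeyError.
    str_to_int = {"do": 0, "you": 1, "like": 2, "tea": 3, ",": 4, "?": 5}  # toy vocab

    tokens = ["Hello", ",", "do", "you", "like", "tea", "?"]
    ids = [str_to_int[s] for s in tokens]  # raises KeyError: 'Hello'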
@@ -1102,7 +1102,7 @@
 "name": "stdout",
 "output_type": "stream",
 "text": [
-"PyTorch version: 2.0.1\n"
+"PyTorch version: 2.1.0\n"
 ]
 }
 ],
@@ -1159,7 +1159,8 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"def create_dataloader_v1(txt, batch_size=4, max_length=256, stride=128, shuffle=True):\n",
+"def create_dataloader_v1(txt, batch_size=4, max_length=256, stride=128, shuffle=True, drop_last=True):\n",
+"\n",
 "    # Initialize the tokenizer\n",
 "    tokenizer = tiktoken.get_encoding(\"gpt2\")\n",
 "\n",
@@ -1167,7 +1168,8 @@
 "    dataset = GPTDatasetV1(txt, tokenizer, max_length, stride)\n",
 "\n",
 "    # Create dataloader\n",
-"    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)\n",
+"    dataloader = DataLoader(\n",
+"        dataset, batch_size=batch_size, shuffle=shuffle, drop_last=drop_last)\n",
 "\n",
 "    return dataloader"
 ]
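
Assembled from the two hunks above, the updated notebook function reads as follows (a sketch; GPTDatasetV1 is the dataset class defined earlier in the chapter):

    import tiktoken
    from torch.utils.data import DataLoader

    def create_dataloader_v1(txt, batch_size=4, max_length=256, stride=128,
                             shuffle=True, drop_last=True):
        # Initialize the tokenizer
        tokenizer = tiktoken.get_encoding("gpt2")

        # Create dataset (GPTDatasetV1 is defined earlier in the notebook)
        dataset = GPTDatasetV1(txt, tokenizer, max_length, stride)

        # Create dataloader; drop_last=True discards a final batch that is
        # shorter than batch_size, so every batch has a uniform shape
        dataloader = DataLoader(
            dataset, batch_size=batch_size, shuffle=shuffle, drop_last=drop_last)

        return dataloader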
@@ -1638,7 +1640,7 @@
 "name": "python",
 "nbconvert_exporter": "python",
 "pygments_lexer": "ipython3",
-"version": "3.11.4"
+"version": "3.10.12"
 }
 },
 "nbformat": 4,
|
@@ -78,7 +78,8 @@
 "        return self.input_ids[idx], self.target_ids[idx]\n",
 "\n",
 "\n",
-"def create_dataloader_v1(txt, batch_size=4, max_length=256, stride=128, shuffle=True):\n",
+"def create_dataloader_v1(txt, batch_size=4, max_length=256, \n",
+"                         stride=128, shuffle=True, drop_last=True):\n",
 "    # Initialize the tokenizer\n",
 "    tokenizer = tiktoken.get_encoding(\"gpt2\")\n",
 "\n",
@@ -86,12 +87,12 @@
 "    dataset = GPTDatasetV1(txt, tokenizer, max_length, stride)\n",
 "\n",
 "    # Create dataloader\n",
-"    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)\n",
+"    dataloader = DataLoader(\n",
+"        dataset, batch_size=batch_size, shuffle=shuffle, drop_last=drop_last)\n",
 "\n",
 "    return dataloader\n",
 "\n",
 "\n",
-"\n",
 "with open(\"the-verdict.txt\", \"r\", encoding=\"utf-8\") as f:\n",
 "    raw_text = f.read()\n",
 "\n",
@@ -163,7 +164,7 @@
 "name": "python",
 "nbconvert_exporter": "python",
 "pygments_lexer": "ipython3",
-"version": "3.11.4"
+"version": "3.10.12"
 }
 },
 "nbformat": 4,
|
@@ -35,7 +35,8 @@ class GPTDatasetV1(Dataset):
         return self.input_ids[idx], self.target_ids[idx]
 
 
-def create_dataloader(txt, batch_size=4, max_length=256, stride=128, shuffle=True):
+def create_dataloader_v1(txt, batch_size=4, max_length=256,
+                         stride=128, shuffle=True, drop_last=True):
     # Initialize the tokenizer
     tokenizer = tiktoken.get_encoding("gpt2")
 
@@ -43,7 +44,8 @@ def create_dataloader(txt, batch_size=4, max_length=256, stride=128, shuffle=Tru
     dataset = GPTDatasetV1(txt, tokenizer, max_length, stride)
 
     # Create dataloader
-    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)
+    dataloader = DataLoader(
+        dataset, batch_size=batch_size, shuffle=shuffle, drop_last=drop_last)
 
     return dataloader
 
|
@@ -27,7 +27,8 @@ class GPTDatasetV1(Dataset):
         return self.input_ids[idx], self.target_ids[idx]
 
 
-def create_dataloader_v1(txt, batch_size=4, max_length=256, stride=128, shuffle=True):
+def create_dataloader_v1(txt, batch_size=4, max_length=256,
+                         stride=128, shuffle=True, drop_last=True):
     # Initialize the tokenizer
     tokenizer = tiktoken.get_encoding("gpt2")
 
@@ -35,7 +36,8 @@ def create_dataloader_v1(txt, batch_size=4, max_length=256, stride=128, shuffle=
     dataset = GPTDatasetV1(txt, tokenizer, max_length, stride)
 
     # Create dataloader
-    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)
+    dataloader = DataLoader(
+        dataset, batch_size=batch_size, shuffle=shuffle, drop_last=drop_last)
 
     return dataloader
 
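
The effect of the new drop_last=True default, in a self-contained sketch (the toy dataset is illustrative, not from the repository):

    import torch
    from torch.utils.data import DataLoader, TensorDataset

    # 10 samples with batch_size=4 leave a final, smaller batch of 2
    data = TensorDataset(torch.arange(10))

    for drop_last in (False, True):
        loader = DataLoader(data, batch_size=4, drop_last=drop_last)
        print(drop_last, [len(batch[0]) for batch in loader])
    # False -> [4, 4, 2]  (ragged final batch)
    # True  -> [4, 4]     (partial final batch dropped, as in this commit)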
|