diff --git a/ch02/01_main-chapter-code/ch02.ipynb b/ch02/01_main-chapter-code/ch02.ipynb
index ba0d757..5d24c15 100644
--- a/ch02/01_main-chapter-code/ch02.ipynb
+++ b/ch02/01_main-chapter-code/ch02.ipynb
@@ -26,7 +26,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "torch version: 2.0.1\n",
+      "torch version: 2.1.0\n",
       "tiktoken version: 0.5.1\n"
      ]
     }
@@ -559,7 +559,7 @@
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)",
      "Cell \u001b[0;32mIn[16], line 5\u001b[0m\n\u001b[1;32m 1\u001b[0m tokenizer \u001b[38;5;241m=\u001b[39m SimpleTokenizerV1(vocab)\n\u001b[1;32m 3\u001b[0m text \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mHello, do you like tea. Is this-- a test?\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m----> 5\u001b[0m \u001b[43mtokenizer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mencode\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtext\u001b[49m\u001b[43m)\u001b[49m\n",
-      "Cell \u001b[0;32mIn[12], line 9\u001b[0m, in \u001b[0;36mSimpleTokenizerV1.encode\u001b[0;34m(self, text)\u001b[0m\n\u001b[1;32m 7\u001b[0m preprocessed \u001b[38;5;241m=\u001b[39m re\u001b[38;5;241m.\u001b[39msplit(\u001b[38;5;124mr\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m([,.?_!\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m()\u001b[39m\u001b[38;5;130;01m\\'\u001b[39;00m\u001b[38;5;124m]|--|\u001b[39m\u001b[38;5;124m\\\u001b[39m\u001b[38;5;124ms)\u001b[39m\u001b[38;5;124m'\u001b[39m, text)\n\u001b[1;32m 8\u001b[0m preprocessed \u001b[38;5;241m=\u001b[39m [item\u001b[38;5;241m.\u001b[39mstrip() \u001b[38;5;28;01mfor\u001b[39;00m item \u001b[38;5;129;01min\u001b[39;00m preprocessed \u001b[38;5;28;01mif\u001b[39;00m item\u001b[38;5;241m.\u001b[39mstrip()]\n\u001b[0;32m----> 9\u001b[0m ids \u001b[38;5;241m=\u001b[39m \u001b[43m[\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mstr_to_int\u001b[49m\u001b[43m[\u001b[49m\u001b[43ms\u001b[49m\u001b[43m]\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43ms\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mpreprocessed\u001b[49m\u001b[43m]\u001b[49m\n\u001b[1;32m 10\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m ids\n",
+      "Cell \u001b[0;32mIn[12], line 9\u001b[0m, in \u001b[0;36mSimpleTokenizerV1.encode\u001b[0;34m(self, text)\u001b[0m\n\u001b[1;32m 7\u001b[0m preprocessed \u001b[38;5;241m=\u001b[39m re\u001b[38;5;241m.\u001b[39msplit(\u001b[38;5;124mr\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m([,.?_!\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m()\u001b[39m\u001b[38;5;130;01m\\'\u001b[39;00m\u001b[38;5;124m]|--|\u001b[39m\u001b[38;5;124m\\\u001b[39m\u001b[38;5;124ms)\u001b[39m\u001b[38;5;124m'\u001b[39m, text)\n\u001b[1;32m 8\u001b[0m preprocessed \u001b[38;5;241m=\u001b[39m [item\u001b[38;5;241m.\u001b[39mstrip() \u001b[38;5;28;01mfor\u001b[39;00m item \u001b[38;5;129;01min\u001b[39;00m preprocessed \u001b[38;5;28;01mif\u001b[39;00m item\u001b[38;5;241m.\u001b[39mstrip()]\n\u001b[0;32m----> 9\u001b[0m ids \u001b[38;5;241m=\u001b[39m [\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstr_to_int[s] \u001b[38;5;28;01mfor\u001b[39;00m s \u001b[38;5;129;01min\u001b[39;00m preprocessed]\n\u001b[1;32m 10\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m ids\n",
       "Cell \u001b[0;32mIn[12], line 9\u001b[0m, in \u001b[0;36m<listcomp>\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m 7\u001b[0m preprocessed \u001b[38;5;241m=\u001b[39m re\u001b[38;5;241m.\u001b[39msplit(\u001b[38;5;124mr\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m([,.?_!\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m()\u001b[39m\u001b[38;5;130;01m\\'\u001b[39;00m\u001b[38;5;124m]|--|\u001b[39m\u001b[38;5;124m\\\u001b[39m\u001b[38;5;124ms)\u001b[39m\u001b[38;5;124m'\u001b[39m, text)\n\u001b[1;32m 8\u001b[0m preprocessed \u001b[38;5;241m=\u001b[39m [item\u001b[38;5;241m.\u001b[39mstrip() \u001b[38;5;28;01mfor\u001b[39;00m item \u001b[38;5;129;01min\u001b[39;00m preprocessed \u001b[38;5;28;01mif\u001b[39;00m item\u001b[38;5;241m.\u001b[39mstrip()]\n\u001b[0;32m----> 9\u001b[0m ids \u001b[38;5;241m=\u001b[39m [\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mstr_to_int\u001b[49m\u001b[43m[\u001b[49m\u001b[43ms\u001b[49m\u001b[43m]\u001b[49m \u001b[38;5;28;01mfor\u001b[39;00m s \u001b[38;5;129;01min\u001b[39;00m preprocessed]\n\u001b[1;32m 10\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m ids\n",
       "\u001b[0;31mKeyError\u001b[0m: 'Hello'"
      ]
@@ -1102,7 +1102,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "PyTorch version: 2.0.1\n"
+      "PyTorch version: 2.1.0\n"
      ]
     }
    ],
@@ -1159,7 +1159,8 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "def create_dataloader_v1(txt, batch_size=4, max_length=256, stride=128, shuffle=True):\n",
+    "def create_dataloader_v1(txt, batch_size=4, max_length=256, stride=128, shuffle=True, drop_last=True):\n",
+    "\n",
     "    # Initialize the tokenizer\n",
     "    tokenizer = tiktoken.get_encoding(\"gpt2\")\n",
     "\n",
@@ -1167,7 +1168,8 @@
     "    dataset = GPTDatasetV1(txt, tokenizer, max_length, stride)\n",
     "\n",
     "    # Create dataloader\n",
-    "    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)\n",
+    "    dataloader = DataLoader(\n",
+    "        dataset, batch_size=batch_size, shuffle=shuffle, drop_last=drop_last)\n",
     "\n",
     "    return dataloader"
    ]
@@ -1638,7 +1640,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-  "version": "3.11.4"
+  "version": "3.10.12"
  }
 },
 "nbformat": 4,
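Note on the recurring change in this patch: the new drop_last=True default in create_dataloader_v1 makes the DataLoader discard the final batch whenever the number of samples is not evenly divisible by batch_size, so every batch seen during training has the same shape; a smaller trailing batch would otherwise add noisier gradient updates at the end of an epoch. A minimal behavioral sketch (not part of the patch; the toy tensor stands in for the tokenized text):

    import torch
    from torch.utils.data import DataLoader, TensorDataset

    dataset = TensorDataset(torch.arange(10))  # 10 toy samples

    kept = DataLoader(dataset, batch_size=4, drop_last=False)
    dropped = DataLoader(dataset, batch_size=4, drop_last=True)

    print(len(kept))     # 3 batches: 4 + 4 + a partial batch of 2
    print(len(dropped))  # 2 batches: the partial batch of 2 is discarded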
diff --git a/ch02/01_main-chapter-code/dataloader.ipynb b/ch02/01_main-chapter-code/dataloader.ipynb
index b6ce60c..519ca43 100644
--- a/ch02/01_main-chapter-code/dataloader.ipynb
+++ b/ch02/01_main-chapter-code/dataloader.ipynb
@@ -78,7 +78,8 @@
     "        return self.input_ids[idx], self.target_ids[idx]\n",
     "\n",
     "\n",
-    "def create_dataloader_v1(txt, batch_size=4, max_length=256, stride=128, shuffle=True):\n",
+    "def create_dataloader_v1(txt, batch_size=4, max_length=256, \n",
+    "                         stride=128, shuffle=True, drop_last=True):\n",
     "    # Initialize the tokenizer\n",
     "    tokenizer = tiktoken.get_encoding(\"gpt2\")\n",
     "\n",
@@ -86,12 +87,12 @@
     "    dataset = GPTDatasetV1(txt, tokenizer, max_length, stride)\n",
     "\n",
     "    # Create dataloader\n",
-    "    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)\n",
+    "    dataloader = DataLoader(\n",
+    "        dataset, batch_size=batch_size, shuffle=shuffle, drop_last=drop_last)\n",
     "\n",
     "    return dataloader\n",
     "\n",
     "\n",
-    "\n",
     "with open(\"the-verdict.txt\", \"r\", encoding=\"utf-8\") as f:\n",
     "    raw_text = f.read()\n",
     "\n",
@@ -163,7 +164,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-  "version": "3.11.4"
+  "version": "3.10.12"
  }
 },
 "nbformat": 4,
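For context on the call shared by both notebooks and both scripts: GPTDatasetV1 (defined earlier in each of these files and visible here only as diff context) tokenizes the full text once, then slides a window over the token IDs; max_length is the window width and stride is the step between consecutive windows. A sketch of such a dataset class follows; the windowing loop is an assumption based on the book's chapter 2 code, while the __getitem__ line is taken from the context lines above:

    import torch
    from torch.utils.data import Dataset

    class GPTDatasetV1(Dataset):
        def __init__(self, txt, tokenizer, max_length, stride):
            token_ids = tokenizer.encode(txt)
            self.input_ids = []
            self.target_ids = []
            # Slide a max_length-wide window over the tokens, stepping by stride;
            # the target window is the input window shifted right by one token.
            for i in range(0, len(token_ids) - max_length, stride):
                self.input_ids.append(torch.tensor(token_ids[i:i + max_length]))
                self.target_ids.append(torch.tensor(token_ids[i + 1:i + max_length + 1]))

        def __len__(self):
            return len(self.input_ids)

        def __getitem__(self, idx):
            return self.input_ids[idx], self.target_ids[idx]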
diff --git a/ch04/01_main-chapter-code/gpt.py b/ch04/01_main-chapter-code/gpt.py
index baa58b2..b3e1335 100644
--- a/ch04/01_main-chapter-code/gpt.py
+++ b/ch04/01_main-chapter-code/gpt.py
@@ -35,7 +35,8 @@ class GPTDatasetV1(Dataset):
         return self.input_ids[idx], self.target_ids[idx]


-def create_dataloader(txt, batch_size=4, max_length=256, stride=128, shuffle=True):
+def create_dataloader_v1(txt, batch_size=4, max_length=256,
+                         stride=128, shuffle=True, drop_last=True):
     # Initialize the tokenizer
     tokenizer = tiktoken.get_encoding("gpt2")

@@ -43,7 +44,8 @@ def create_dataloader(txt, batch_size=4, max_length=256, stride=128, shuffle=Tru
     dataset = GPTDatasetV1(txt, tokenizer, max_length, stride)

     # Create dataloader
-    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)
+    dataloader = DataLoader(
+        dataset, batch_size=batch_size, shuffle=shuffle, drop_last=drop_last)

     return dataloader

diff --git a/ch04/01_main-chapter-code/previous_chapters.py b/ch04/01_main-chapter-code/previous_chapters.py
index b328743..28426cd 100644
--- a/ch04/01_main-chapter-code/previous_chapters.py
+++ b/ch04/01_main-chapter-code/previous_chapters.py
@@ -27,7 +27,8 @@ class GPTDatasetV1(Dataset):
         return self.input_ids[idx], self.target_ids[idx]


-def create_dataloader_v1(txt, batch_size=4, max_length=256, stride=128, shuffle=True):
+def create_dataloader_v1(txt, batch_size=4, max_length=256,
+                         stride=128, shuffle=True, drop_last=True):
     # Initialize the tokenizer
     tokenizer = tiktoken.get_encoding("gpt2")

@@ -35,7 +36,8 @@ def create_dataloader_v1(txt, batch_size=4, max_length=256, stride=128, shuffle=
     dataset = GPTDatasetV1(txt, tokenizer, max_length, stride)

     # Create dataloader
-    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)
+    dataloader = DataLoader(
+        dataset, batch_size=batch_size, shuffle=shuffle, drop_last=drop_last)

     return dataloader
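For reference, a usage sketch of the updated function (hypothetical driver code, assuming it runs in a directory containing previous_chapters.py and the-verdict.txt; the parameter values mirror the chapter 2 notebook):

    from previous_chapters import create_dataloader_v1

    with open("the-verdict.txt", "r", encoding="utf-8") as f:
        raw_text = f.read()

    dataloader = create_dataloader_v1(
        raw_text, batch_size=8, max_length=4, stride=4,
        shuffle=False, drop_last=True)

    inputs, targets = next(iter(dataloader))
    print(inputs.shape)  # torch.Size([8, 4]): batch_size x max_length

Note that gpt.py also renames create_dataloader to create_dataloader_v1 in this patch, so any code importing the old name must be updated to match.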