diff --git a/.gitignore b/.gitignore
index c4987cc..2b175ed 100644
--- a/.gitignore
+++ b/.gitignore
@@ -13,6 +13,7 @@ ch05/02_alternative_weight_loading/checkpoints
 ch05/01_main-chapter-code/model.pth
 ch05/01_main-chapter-code/model_and_optimizer.pth
 ch05/03_bonus_pretraining_on_gutenberg/model_checkpoints
+ch06/01_main-chapter-code/gpt2
 
 # Datasets
 ch02/01_main-chapter-code/number-data.txt
diff --git a/ch06/01_main-chapter-code/ch06.ipynb b/ch06/01_main-chapter-code/ch06.ipynb
index a68fc18..7203226 100644
--- a/ch06/01_main-chapter-code/ch06.ipynb
+++ b/ch06/01_main-chapter-code/ch06.ipynb
@@ -628,7 +628,10 @@
     "    def __getitem__(self, index):\n",
     "        encoded = self.encoded_texts[index]\n",
     "        label = self.data.iloc[index][\"Label\"]\n",
-    "        return torch.tensor(encoded, dtype=torch.long), torch.tensor(label, dtype=torch.long)\n",
+    "        return (\n",
+    "            torch.tensor(encoded, dtype=torch.long),\n",
+    "            torch.tensor(label, dtype=torch.long)\n",
+    "        )\n",
     "\n",
     "    def __len__(self):\n",
     "        return len(self.data)\n",
@@ -663,7 +666,12 @@
     }
    ],
    "source": [
-    "train_dataset = SpamDataset(\"train.csv\", max_length=None, tokenizer=tokenizer)\n",
+    "train_dataset = SpamDataset(\n",
+    "    csv_file=\"train.csv\",\n",
+    "    max_length=None,\n",
+    "    tokenizer=tokenizer\n",
+    ")\n",
+    "\n",
     "print(train_dataset.max_length)"
    ]
   },
@@ -686,8 +694,16 @@
    },
    "outputs": [],
    "source": [
-    "val_dataset = SpamDataset(\"validation.csv\", max_length=train_dataset.max_length, tokenizer=tokenizer)\n",
-    "test_dataset = SpamDataset(\"test.csv\", max_length=train_dataset.max_length, tokenizer=tokenizer)"
+    "val_dataset = SpamDataset(\n",
+    "    csv_file=\"validation.csv\",\n",
+    "    max_length=train_dataset.max_length,\n",
+    "    tokenizer=tokenizer\n",
+    ")\n",
+    "test_dataset = SpamDataset(\n",
+    "    csv_file=\"test.csv\",\n",
+    "    max_length=train_dataset.max_length,\n",
+    "    tokenizer=tokenizer\n",
+    ")"
    ]
   },
   {
@@ -695,7 +711,7 @@
    "id": "20170d89-85a0-4844-9887-832f5d23432a",
    "metadata": {},
    "source": [
-    "- Next, we use the dataset to instantiate the data loaders, which is similar to creating the data loaders in previous chapters:"
+    "- Next, we use the dataset to instantiate the data loaders, which is similar to creating the data loaders in previous chapters"
    ]
   },
   {
@@ -787,7 +803,7 @@
    "id": "5cdd7947-7039-49bf-8a5e-c0a2f4281ca1",
    "metadata": {},
    "source": [
-    "- Lastly, let's print the total number of batches in each dataset:"
+    "- Lastly, let's print the total number of batches in each dataset"
    ]
   },
   {
@@ -880,16 +896,16 @@
    },
    "outputs": [
     {
-     "name": "stdout",
+     "name": "stderr",
      "output_type": "stream",
      "text": [
-      "File already exists and is up-to-date: gpt2/124M/checkpoint\n",
-      "File already exists and is up-to-date: gpt2/124M/encoder.json\n",
-      "File already exists and is up-to-date: gpt2/124M/hparams.json\n",
-      "File already exists and is up-to-date: gpt2/124M/model.ckpt.data-00000-of-00001\n",
-      "File already exists and is up-to-date: gpt2/124M/model.ckpt.index\n",
-      "File already exists and is up-to-date: gpt2/124M/model.ckpt.meta\n",
-      "File already exists and is up-to-date: gpt2/124M/vocab.bpe\n"
+      "checkpoint: 100%|███████████████████████████| 77.0/77.0 [00:00<00:00, 39.7kiB/s]\n",
+      "encoder.json: 100%|███████████████████████| 1.04M/1.04M [00:00<00:00, 3.25MiB/s]\n",
+      "hparams.json: 100%|█████████████████████████| 90.0/90.0 [00:00<00:00, 51.4kiB/s]\n",
+      "model.ckpt.data-00000-of-00001: 100%|███████| 498M/498M [01:00<00:00, 8.20MiB/s]\n",
+      "model.ckpt.index: 100%|███████████████████| 5.21k/5.21k [00:00<00:00, 2.34MiB/s]\n",
+      "model.ckpt.meta: 100%|██████████████████████| 471k/471k [00:00<00:00, 2.26MiB/s]\n",
+      "vocab.bpe: 100%|████████████████████████████| 456k/456k [00:00<00:00, 2.62MiB/s]\n"
      ]
     }
    ],