diff --git a/ch04/01_main-chapter-code/ch04.ipynb b/ch04/01_main-chapter-code/ch04.ipynb
index 61f1d74..dc4e3ec 100644
--- a/ch04/01_main-chapter-code/ch04.ipynb
+++ b/ch04/01_main-chapter-code/ch04.ipynb
@@ -113,7 +113,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 60,
+   "execution_count": 2,
    "id": "619c2eed-f8ea-4ff5-92c3-feda0f29b227",
    "metadata": {},
    "outputs": [],
@@ -181,7 +181,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 61,
+   "execution_count": 3,
    "id": "794b6b6c-d36f-411e-a7db-8ac566a87fee",
    "metadata": {},
    "outputs": [
@@ -212,7 +212,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 62,
+   "execution_count": 4,
    "id": "009238cd-0160-4834-979c-309710986bb0",
    "metadata": {},
    "outputs": [
@@ -279,7 +279,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 25,
+   "execution_count": 5,
    "id": "79e1b463-dc3f-44ac-9cdb-9d5b6f64eb9d",
    "metadata": {},
    "outputs": [
@@ -314,7 +314,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 26,
+   "execution_count": 6,
    "id": "9888f79e-8e69-44aa-8a19-cd34292adbf5",
    "metadata": {},
    "outputs": [
@@ -365,7 +365,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 27,
+   "execution_count": 7,
    "id": "9a1d1bb9-3341-4c9a-bc2a-d2489bf89cda",
    "metadata": {},
    "outputs": [
@@ -378,8 +378,8 @@
       " [-0.0189, 0.1121, -1.0876, 1.5173, 0.5647, -1.0876]],\n",
       " grad_fn=)\n",
       "Mean:\n",
-      " tensor([[ 0.0000],\n",
-      " [ 0.0000]], grad_fn=)\n",
+      " tensor([[2.9802e-08],\n",
+      " [3.9736e-08]], grad_fn=)\n",
       "Variance:\n",
       " tensor([[1.],\n",
       " [1.]], grad_fn=)\n"
@@ -406,7 +406,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 28,
+   "execution_count": 8,
    "id": "3e06c34b-c68a-4b36-afbe-b30eda4eca39",
    "metadata": {},
    "outputs": [
@@ -440,7 +440,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 29,
+   "execution_count": 9,
    "id": "3333a305-aa3d-460a-bcce-b80662d464d9",
    "metadata": {},
    "outputs": [],
@@ -482,7 +482,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 30,
+   "execution_count": 10,
    "id": "23b1000a-e613-4b43-bd90-e54deed8d292",
    "metadata": {},
    "outputs": [],
@@ -493,7 +493,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 31,
+   "execution_count": 11,
    "id": "94c12de2-1cab-46e0-a099-e2e470353bff",
    "metadata": {},
    "outputs": [
@@ -558,7 +558,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 32,
+   "execution_count": 12,
    "id": "f84694b7-95f3-4323-b6d6-0a73df278e82",
    "metadata": {},
    "outputs": [],
@@ -576,7 +576,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 33,
+   "execution_count": 13,
    "id": "fc5487d2-2576-4118-80a7-56c4caac2e71",
    "metadata": {},
    "outputs": [
@@ -626,7 +626,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 34,
+   "execution_count": 14,
    "id": "9275c879-b148-4579-a107-86827ca14d4d",
    "metadata": {},
    "outputs": [],
@@ -647,7 +647,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 35,
+   "execution_count": 15,
    "id": "7c4976e2-0261-418e-b042-c5be98c2ccaf",
    "metadata": {},
    "outputs": [
@@ -673,7 +673,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 36,
+   "execution_count": 16,
    "id": "928e7f7c-d0b1-499f-8d07-4cadb428a6f9",
    "metadata": {},
    "outputs": [
@@ -734,7 +734,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 37,
+   "execution_count": 17,
    "id": "05473938-799c-49fd-86d4-8ed65f94fee6",
    "metadata": {},
    "outputs": [],
@@ -792,7 +792,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 38,
+   "execution_count": 18,
    "id": "c75f43cc-6923-4018-b980-26023086572c",
    "metadata": {},
    "outputs": [
@@ -830,7 +830,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 39,
+   "execution_count": 19,
    "id": "11b7c0c2-f9dd-4dd5-b096-a05c48c5f6d6",
    "metadata": {},
    "outputs": [
@@ -883,7 +883,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 40,
+   "execution_count": 20,
    "id": "0e1e8176-e5e3-4152-b1aa-0bbd7891dfd9",
    "metadata": {},
    "outputs": [],
@@ -943,7 +943,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 64,
+   "execution_count": 21,
    "id": "3fb45a63-b1f3-4b08-b525-dafbc8228405",
    "metadata": {},
    "outputs": [
@@ -969,7 +969,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 42,
+   "execution_count": 22,
    "id": "01e737a6-fc99-42bb-9f7e-4da899168811",
    "metadata": {},
    "outputs": [
@@ -1036,7 +1036,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 43,
+   "execution_count": 23,
    "id": "c61de39c-d03c-4a32-8b57-f49ac3834857",
    "metadata": {},
    "outputs": [],
@@ -1061,6 +1061,7 @@
     "        tok_embeds = self.tok_emb(in_idx)\n",
     "        pos_embeds = self.pos_emb(torch.arange(seq_len, device=in_idx.device))\n",
     "        x = tok_embeds + pos_embeds  # Shape [batch_size, num_tokens, emb_size]\n",
+    "        x = self.drop_emb(x)\n",
     "        x = self.trf_blocks(x)\n",
     "        x = self.final_norm(x)\n",
     "        logits = self.out_head(x)\n",
@@ -1075,6 +1076,44 @@
     "- Using the configuration of the 124M parameter model, we can now instantiate this GPT model with random initial weights as follows:"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "id": "ef94fd9c-4e9d-470d-8f8e-dd23d1bb1f64",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Input batch:\n",
+      " tensor([[6109, 3626, 6100, 345],\n",
+      " [6109, 1110, 6622, 257]])\n",
+      "\n",
+      "Output shape: torch.Size([2, 4, 50257])\n",
+      "tensor([[[ 0.6525, 0.5753, 0.0174, ..., 0.2988, 0.1441, 0.0032],\n",
+      " [ 0.0839, -0.6789, -0.6605, ..., -0.2912, 0.4267, -0.2696],\n",
+      " [ 0.8440, 0.1894, 0.0708, ..., 0.0982, -0.2183, 0.0920],\n",
+      " [-0.7958, 0.5066, 0.0209, ..., 0.7497, 0.3233, -0.1251]],\n",
+      "\n",
+      " [[ 0.0181, 0.2606, -0.3022, ..., 0.2940, 0.1998, -0.6246],\n",
+      " [ 0.0596, 0.3041, -0.0293, ..., 0.6796, -0.1226, 0.1303],\n",
+      " [ 1.1895, 1.0891, 0.0237, ..., 0.8299, 0.1794, -0.2250],\n",
+      " [ 0.5457, 0.1861, 0.3872, ..., 1.3537, -0.4062, -0.0268]]],\n",
+      " grad_fn=)\n"
+     ]
+    }
+   ],
+   "source": [
+    "torch.manual_seed(123)\n",
+    "model = GPTModel(GPT_CONFIG_124M)\n",
+    "\n",
+    "out = model(batch)\n",
+    "print(\"Input batch:\\n\", batch)\n",
+    "print(\"\\nOutput shape:\", out.shape)\n",
+    "print(out)"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 44,
diff --git a/ch04/01_main-chapter-code/exercise-solutions.ipynb b/ch04/01_main-chapter-code/exercise-solutions.ipynb
index 5167827..ed2135e 100644
--- a/ch04/01_main-chapter-code/exercise-solutions.ipynb
+++ b/ch04/01_main-chapter-code/exercise-solutions.ipynb
@@ -336,6 +336,7 @@
     "        tok_embeds = self.tok_emb(in_idx)\n",
     "        pos_embeds = self.pos_emb(torch.arange(seq_len, device=in_idx.device))\n",
     "        x = tok_embeds + pos_embeds  # Shape [batch_size, num_tokens, emb_size]\n",
+    "        x = self.drop_emb(x)\n",
     "        x = self.trf_blocks(x)\n",
     "        x = self.final_norm(x)\n",
     "        logits = self.out_head(x)\n",
diff --git a/ch04/01_main-chapter-code/gpt.py b/ch04/01_main-chapter-code/gpt.py
index b103b72..f6dde98 100644
--- a/ch04/01_main-chapter-code/gpt.py
+++ b/ch04/01_main-chapter-code/gpt.py
@@ -202,6 +202,7 @@ class GPTModel(nn.Module):
         tok_embeds = self.tok_emb(in_idx)
         pos_embeds = self.pos_emb(torch.arange(seq_len, device=in_idx.device))
         x = tok_embeds + pos_embeds  # Shape [batch_size, num_tokens, emb_size]
+        x = self.drop_emb(x)
         x = self.trf_blocks(x)
         x = self.final_norm(x)
         logits = self.out_head(x)
diff --git a/ch05/02_hparam_tuning/previous_chapters.py b/ch05/02_hparam_tuning/previous_chapters.py
index b3e1335..6b1a00e 100644
--- a/ch05/02_hparam_tuning/previous_chapters.py
+++ b/ch05/02_hparam_tuning/previous_chapters.py
@@ -202,6 +202,7 @@ class GPTModel(nn.Module):
         tok_embeds = self.tok_emb(in_idx)
         pos_embeds = self.pos_emb(torch.arange(seq_len, device=in_idx.device))
         x = tok_embeds + pos_embeds  # Shape [batch_size, num_tokens, emb_size]
+        x = self.drop_emb(x)
         x = self.trf_blocks(x)
         x = self.final_norm(x)
         logits = self.out_head(x)
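Note on the recurring change above: GPTModel defines an embedding dropout layer (self.drop_emb), but the forward pass never applied it, so the summed token and positional embeddings went straight into the transformer blocks. This patch inserts the missing x = self.drop_emb(x) call in all four copies of the model. The standalone sketch below (toy layer sizes; the names tok_emb, pos_emb, and drop_emb mirror the diff but are defined here purely for illustration and are not the repository's code) shows what that single call does: dropout zeroes a fraction of the embedding activations during training and acts as an identity function in eval mode.

# Standalone sketch, not the repository's GPTModel.
import torch
import torch.nn as nn

torch.manual_seed(123)

vocab_size, context_length, emb_dim, drop_rate = 100, 8, 4, 0.5

tok_emb = nn.Embedding(vocab_size, emb_dim)       # token embedding table
pos_emb = nn.Embedding(context_length, emb_dim)   # positional embedding table
drop_emb = nn.Dropout(drop_rate)                  # the layer the diff now applies

in_idx = torch.randint(0, vocab_size, (2, 6))     # [batch_size, num_tokens]
seq_len = in_idx.shape[1]

# The step the diff adds: dropout on the combined embeddings
# before they would reach the transformer blocks.
x = tok_emb(in_idx) + pos_emb(torch.arange(seq_len))

x_train = drop_emb(x)   # train mode (default): ~drop_rate of values zeroed, rest scaled by 1/(1 - drop_rate)

drop_emb.eval()         # eval mode: dropout passes the embeddings through unchanged
x_eval = drop_emb(x)

print("zeroed fraction (train):", (x_train == 0).float().mean().item())
print("unchanged in eval mode: ", torch.equal(x_eval, x))

In the patched models the same behavior follows automatically from model.train() and model.eval(), since self.drop_emb is a registered submodule; without the added call, the configured dropout rate for the embedding layer simply had no effect.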