diff --git a/ch03/01_main-chapter-code/ch03.ipynb b/ch03/01_main-chapter-code/ch03.ipynb
index 734bcdf..717af34 100644
--- a/ch03/01_main-chapter-code/ch03.ipynb
+++ b/ch03/01_main-chapter-code/ch03.ipynb
@@ -1608,7 +1608,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 37,
+   "execution_count": 42,
    "id": "110b0188-6e9e-4e56-a988-10523c6c8538",
    "metadata": {},
    "outputs": [
@@ -1670,12 +1670,12 @@
     "\n",
     "        # Compute scaled dot-product attention (aka self-attention) with a causal mask\n",
     "        attn_scores = queries @ keys.transpose(2, 3)  # Dot product for each head\n",
+    "\n",
     "        # Original mask truncated to the number of tokens and converted to boolean\n",
     "        mask_bool = self.mask.bool()[:num_tokens, :num_tokens]\n",
-    "        # Unsqueeze the mask to match dimensions\n",
-    "        mask_unsqueezed = mask_bool.unsqueeze(0)\n",
-    "        # Use the unsqueezed mask to fill attention scores\n",
-    "        attn_scores.masked_fill_(mask_unsqueezed, -torch.inf)\n",
+    "\n",
+    "        # Use the mask to fill attention scores\n",
+    "        attn_scores.masked_fill_(mask_bool, -torch.inf)\n",
     "        \n",
     "        attn_weights = torch.softmax(attn_scores / keys.shape[-1]**0.5, dim=-1)\n",
     "        attn_weights = self.dropout(attn_weights)\n",
@@ -1865,7 +1865,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.12"
+   "version": "3.11.4"
   }
  },
  "nbformat": 4,
diff --git a/ch03/01_main-chapter-code/multihead-attention.ipynb b/ch03/01_main-chapter-code/multihead-attention.ipynb
index 2a072d3..d173843 100644
--- a/ch03/01_main-chapter-code/multihead-attention.ipynb
+++ b/ch03/01_main-chapter-code/multihead-attention.ipynb
@@ -148,7 +148,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 6,
    "id": "a44e682d-1c3c-445d-85fa-b142f89f8503",
    "metadata": {},
    "outputs": [],
@@ -196,7 +196,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 7,
    "id": "7898551e-f582-48ac-9f66-3632abe2a93f",
    "metadata": {},
    "outputs": [
@@ -235,7 +235,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 10,
    "id": "2773c09d-c136-4372-a2be-04b58d292842",
    "metadata": {},
    "outputs": [],
@@ -276,12 +276,12 @@
     "\n",
     "        # Compute scaled dot-product attention (aka self-attention) with a causal mask\n",
     "        attn_scores = queries @ keys.transpose(2, 3)  # Dot product for each head\n",
+    "        \n",
     "        # Original mask truncated to the number of tokens and converted to boolean\n",
     "        mask_bool = self.mask.bool()[:num_tokens, :num_tokens]\n",
-    "        # Unsqueeze the mask to match dimensions\n",
-    "        mask_unsqueezed = mask_bool.unsqueeze(0)\n",
-    "        # Use the unsqueezed mask to fill attention scores\n",
-    "        attn_scores.masked_fill_(mask_unsqueezed, -torch.inf)\n",
+    "\n",
+    "        # Use the mask to fill attention scores\n",
+    "        attn_scores.masked_fill_(mask_bool, -torch.inf)\n",
     "        \n",
     "        attn_weights = torch.softmax(attn_scores / keys.shape[-1]**0.5, dim=-1)\n",
     "        attn_weights = self.dropout(attn_weights)\n",
@@ -298,7 +298,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 11,
    "id": "779fdd04-0152-4308-af08-840800a7f395",
    "metadata": {},
    "outputs": [
@@ -324,6 +324,14 @@
     "\n",
     "print(\"context_vecs.shape:\", context_vecs.shape)"
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3ac01b16-8ac6-4487-a6f2-fd9cf33a9fe4",
+   "metadata": {},
+   "outputs": [],
+   "source": []
   }
  ],
  "metadata": {
@@ -342,7 +350,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.12"
+   "version": "3.11.4"
   }
  },
  "nbformat": 4,
diff --git a/ch03/02_bonus_efficient-multihead-attention/ch03.py b/ch03/02_bonus_efficient-multihead-attention/ch03.py
index ed77eb0..3be1cdb 100644
--- a/ch03/02_bonus_efficient-multihead-attention/ch03.py
+++ b/ch03/02_bonus_efficient-multihead-attention/ch03.py
@@ -79,12 +79,12 @@ class MultiHeadAttention(nn.Module):
 
         # Compute scaled dot-product attention (aka self-attention) with a causal mask
         attn_scores = queries @ keys.transpose(2, 3)  # Dot product for each head
+
         # Original mask truncated to the number of tokens and converted to boolean
         mask_bool = self.mask.bool()[:num_tokens, :num_tokens]
-        # Unsqueeze the mask to match dimensions
-        mask_unsqueezed = mask_bool.unsqueeze(0)
-        # Use the unsqueezed mask to fill attention scores
-        attn_scores.masked_fill_(mask_unsqueezed, -torch.inf)
+
+        # Use the mask to fill attention scores
+        attn_scores.masked_fill_(mask_bool, -torch.inf)
 
         attn_weights = torch.softmax(attn_scores / keys.shape[-1]**0.5, dim=-1)
         attn_weights = self.dropout(attn_weights)
diff --git a/ch03/02_bonus_efficient-multihead-attention/mha-implementations.ipynb b/ch03/02_bonus_efficient-multihead-attention/mha-implementations.ipynb
index 85a250e..227437d 100644
--- a/ch03/02_bonus_efficient-multihead-attention/mha-implementations.ipynb
+++ b/ch03/02_bonus_efficient-multihead-attention/mha-implementations.ipynb
@@ -544,7 +544,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "914 ms ± 50.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
+      "1.15 s ± 86.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
      ]
     }
    ],
@@ -569,7 +569,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "252 ms ± 9.04 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
+      "273 ms ± 3.63 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
      ]
     }
    ],
@@ -594,7 +594,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "300 ms ± 8.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
+      "324 ms ± 17.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
      ]
     }
    ],
@@ -619,7 +619,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "94.2 ms ± 1.6 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n"
+      "106 ms ± 598 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n"
      ]
     }
    ],
@@ -644,7 +644,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "297 ms ± 2.37 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
+      "351 ms ± 7.88 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
      ]
     }
    ],
@@ -665,7 +665,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "274 ms ± 2.19 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
+      "333 ms ± 14.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
      ]
     }
    ],
diff --git a/ch04/01_main-chapter-code/gpt.py b/ch04/01_main-chapter-code/gpt.py
index 8390ddb..2c6ca71 100644
--- a/ch04/01_main-chapter-code/gpt.py
+++ b/ch04/01_main-chapter-code/gpt.py
@@ -89,12 +89,12 @@ class MultiHeadAttention(nn.Module):
 
         # Compute scaled dot-product attention (aka self-attention) with a causal mask
         attn_scores = queries @ keys.transpose(2, 3)  # Dot product for each head
+
         # Original mask truncated to the number of tokens and converted to boolean
         mask_bool = self.mask.bool()[:num_tokens, :num_tokens]
-        # Unsqueeze the mask to match dimensions
-        mask_unsqueezed = mask_bool.unsqueeze(0)
-        # Use the unsqueezed mask to fill attention scores
-        attn_scores.masked_fill_(mask_unsqueezed, -torch.inf)
+
+        # Use the mask to fill attention scores
+        attn_scores.masked_fill_(mask_bool, -torch.inf)
 
         attn_weights = torch.softmax(attn_scores / keys.shape[-1]**0.5, dim=-1)
         attn_weights = self.dropout(attn_weights)
diff --git a/ch04/01_main-chapter-code/previous_chapters.py b/ch04/01_main-chapter-code/previous_chapters.py
index 21b2edf..ec8c3c7 100644
--- a/ch04/01_main-chapter-code/previous_chapters.py
+++ b/ch04/01_main-chapter-code/previous_chapters.py
@@ -78,12 +78,12 @@ class MultiHeadAttention(nn.Module):
 
         # Compute scaled dot-product attention (aka self-attention) with a causal mask
         attn_scores = queries @ keys.transpose(2, 3)  # Dot product for each head
+
         # Original mask truncated to the number of tokens and converted to boolean
         mask_bool = self.mask.bool()[:num_tokens, :num_tokens]
-        # Unsqueeze the mask to match dimensions
-        mask_unsqueezed = mask_bool.unsqueeze(0)
-        # Use the unsqueezed mask to fill attention scores
-        attn_scores.masked_fill_(mask_unsqueezed, -torch.inf)
+
+        # Use the mask to fill attention scores
+        attn_scores.masked_fill_(mask_bool, -torch.inf)
 
         attn_weights = torch.softmax(attn_scores / keys.shape[-1]**0.5, dim=-1)
         attn_weights = self.dropout(attn_weights)
diff --git a/ch05/02_hparam_tuning/previous_chapters.py b/ch05/02_hparam_tuning/previous_chapters.py
index fc8f64b..2c6ca71 100644
--- a/ch05/02_hparam_tuning/previous_chapters.py
+++ b/ch05/02_hparam_tuning/previous_chapters.py
@@ -89,12 +89,12 @@ class MultiHeadAttention(nn.Module):
 
         # Compute scaled dot-product attention (aka self-attention) with a causal mask
         attn_scores = queries @ keys.transpose(2, 3)  # Dot product for each head
+
         # Original mask truncated to the number of tokens and converted to boolean
         mask_bool = self.mask.bool()[:num_tokens, :num_tokens]
-        # Unsqueeze the mask twice to match dimensions
-        mask_unsqueezed = mask_bool.unsqueeze(0).unsqueeze(0)
-        # Use the unsqueezed mask to fill attention scores
-        attn_scores.masked_fill_(mask_unsqueezed, -torch.inf)
+
+        # Use the mask to fill attention scores
+        attn_scores.masked_fill_(mask_bool, -torch.inf)
 
         attn_weights = torch.softmax(attn_scores / keys.shape[-1]**0.5, dim=-1)
         attn_weights = self.dropout(attn_weights)
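
Note (not part of the patch above): dropping the explicit unsqueeze is safe because Tensor.masked_fill_ only requires the mask to be broadcastable to the tensor it modifies, so the 2D (num_tokens, num_tokens) boolean mask is expanded automatically over the leading batch and head dimensions of the 4D attention-score tensor. A minimal, self-contained sketch with placeholder sizes:

import torch

# Placeholder sizes, chosen only for this demo
b, num_heads, num_tokens, head_dim = 2, 4, 6, 8

attn_scores = torch.randn(b, num_heads, num_tokens, num_tokens)

# Causal mask as registered in MultiHeadAttention: True above the diagonal
mask_bool = torch.triu(torch.ones(num_tokens, num_tokens), diagonal=1).bool()

# (num_tokens, num_tokens) broadcasts to (b, num_heads, num_tokens, num_tokens),
# so no unsqueeze is needed before masking
attn_scores.masked_fill_(mask_bool, -torch.inf)

attn_weights = torch.softmax(attn_scores / head_dim**0.5, dim=-1)
print(attn_weights[0, 0])  # upper triangle is zero, i.e., future positions are masked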