diff --git a/ch03/01_main-chapter-code/ch03.ipynb b/ch03/01_main-chapter-code/ch03.ipynb
index 734bcdf..717af34 100644
--- a/ch03/01_main-chapter-code/ch03.ipynb
+++ b/ch03/01_main-chapter-code/ch03.ipynb
@@ -1608,7 +1608,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 37,
+   "execution_count": 42,
    "id": "110b0188-6e9e-4e56-a988-10523c6c8538",
    "metadata": {},
    "outputs": [
@@ -1670,12 +1670,12 @@
     "\n",
     "        # Compute scaled dot-product attention (aka self-attention) with a causal mask\n",
     "        attn_scores = queries @ keys.transpose(2, 3)  # Dot product for each head\n",
+    "\n",
     "        # Original mask truncated to the number of tokens and converted to boolean\n",
     "        mask_bool = self.mask.bool()[:num_tokens, :num_tokens]\n",
-    "        # Unsqueeze the mask to match dimensions\n",
-    "        mask_unsqueezed = mask_bool.unsqueeze(0)\n",
-    "        # Use the unsqueezed mask to fill attention scores\n",
-    "        attn_scores.masked_fill_(mask_unsqueezed, -torch.inf)\n",
+    "\n",
+    "        # Use the mask to fill attention scores\n",
+    "        attn_scores.masked_fill_(mask_bool, -torch.inf)\n",
     "        \n",
     "        attn_weights = torch.softmax(attn_scores / keys.shape[-1]**0.5, dim=-1)\n",
     "        attn_weights = self.dropout(attn_weights)\n",
@@ -1865,7 +1865,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.12"
+   "version": "3.11.4"
   }
  },
  "nbformat": 4,
diff --git a/ch03/01_main-chapter-code/multihead-attention.ipynb b/ch03/01_main-chapter-code/multihead-attention.ipynb
index 2a072d3..d173843 100644
--- a/ch03/01_main-chapter-code/multihead-attention.ipynb
+++ b/ch03/01_main-chapter-code/multihead-attention.ipynb
@@ -148,7 +148,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 6,
    "id": "a44e682d-1c3c-445d-85fa-b142f89f8503",
    "metadata": {},
    "outputs": [],
@@ -196,7 +196,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 7,
    "id": "7898551e-f582-48ac-9f66-3632abe2a93f",
    "metadata": {},
    "outputs": [
@@ -235,7 +235,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 10,
    "id": "2773c09d-c136-4372-a2be-04b58d292842",
    "metadata": {},
    "outputs": [],
@@ -276,12 +276,12 @@
     "\n",
     "        # Compute scaled dot-product attention (aka self-attention) with a causal mask\n",
     "        attn_scores = queries @ keys.transpose(2, 3)  # Dot product for each head\n",
+    "        \n",
     "        # Original mask truncated to the number of tokens and converted to boolean\n",
     "        mask_bool = self.mask.bool()[:num_tokens, :num_tokens]\n",
-    "        # Unsqueeze the mask to match dimensions\n",
-    "        mask_unsqueezed = mask_bool.unsqueeze(0)\n",
-    "        # Use the unsqueezed mask to fill attention scores\n",
-    "        attn_scores.masked_fill_(mask_unsqueezed, -torch.inf)\n",
+    "\n",
+    "        # Use the mask to fill attention scores\n",
+    "        attn_scores.masked_fill_(mask_bool, -torch.inf)\n",
     "        \n",
     "        attn_weights = torch.softmax(attn_scores / keys.shape[-1]**0.5, dim=-1)\n",
     "        attn_weights = self.dropout(attn_weights)\n",
@@ -298,7 +298,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 11,
    "id": "779fdd04-0152-4308-af08-840800a7f395",
    "metadata": {},
    "outputs": [
@@ -324,6 +324,14 @@
     "\n",
     "print(\"context_vecs.shape:\", context_vecs.shape)"
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3ac01b16-8ac6-4487-a6f2-fd9cf33a9fe4",
+   "metadata": {},
+   "outputs": [],
+   "source": []
   }
  ],
  "metadata": {
@@ -342,7 +350,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.12"
+   "version": "3.11.4"
   }
  },
  "nbformat": 4,
diff --git a/ch03/02_bonus_efficient-multihead-attention/ch03.py b/ch03/02_bonus_efficient-multihead-attention/ch03.py
index ed77eb0..3be1cdb 100644
--- a/ch03/02_bonus_efficient-multihead-attention/ch03.py
+++ b/ch03/02_bonus_efficient-multihead-attention/ch03.py
@@ -79,12 +79,12 @@ class MultiHeadAttention(nn.Module):
 
         # Compute scaled dot-product attention (aka self-attention) with a causal mask
         attn_scores = queries @ keys.transpose(2, 3)  # Dot product for each head
+
         # Original mask truncated to the number of tokens and converted to boolean
         mask_bool = self.mask.bool()[:num_tokens, :num_tokens]
-        # Unsqueeze the mask to match dimensions
-        mask_unsqueezed = mask_bool.unsqueeze(0)
-        # Use the unsqueezed mask to fill attention scores
-        attn_scores.masked_fill_(mask_unsqueezed, -torch.inf)
+
+        # Use the mask to fill attention scores
+        attn_scores.masked_fill_(mask_bool, -torch.inf)
 
         attn_weights = torch.softmax(attn_scores / keys.shape[-1]**0.5, dim=-1)
         attn_weights = self.dropout(attn_weights)
diff --git a/ch03/02_bonus_efficient-multihead-attention/mha-implementations.ipynb b/ch03/02_bonus_efficient-multihead-attention/mha-implementations.ipynb
index 85a250e..227437d 100644
--- a/ch03/02_bonus_efficient-multihead-attention/mha-implementations.ipynb
+++ b/ch03/02_bonus_efficient-multihead-attention/mha-implementations.ipynb
@@ -544,7 +544,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "914 ms ± 50.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
+      "1.15 s ± 86.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
      ]
     }
    ],
@@ -569,7 +569,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "252 ms ± 9.04 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
+      "273 ms ± 3.63 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
      ]
     }
    ],
@@ -594,7 +594,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "300 ms ± 8.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
+      "324 ms ± 17.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
      ]
     }
    ],
@@ -619,7 +619,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "94.2 ms ± 1.6 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n"
+      "106 ms ± 598 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n"
      ]
     }
    ],
@@ -644,7 +644,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "297 ms ± 2.37 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
+      "351 ms ± 7.88 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
      ]
     }
    ],
@@ -665,7 +665,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "274 ms ± 2.19 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
+      "333 ms ± 14.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
      ]
     }
    ],
diff --git a/ch04/01_main-chapter-code/gpt.py b/ch04/01_main-chapter-code/gpt.py
index 8390ddb..2c6ca71 100644
--- a/ch04/01_main-chapter-code/gpt.py
+++ b/ch04/01_main-chapter-code/gpt.py
@@ -89,12 +89,12 @@ class MultiHeadAttention(nn.Module):
 
         # Compute scaled dot-product attention (aka self-attention) with a causal mask
         attn_scores = queries @ keys.transpose(2, 3)  # Dot product for each head
+
         # Original mask truncated to the number of tokens and converted to boolean
         mask_bool = self.mask.bool()[:num_tokens, :num_tokens]
-        # Unsqueeze the mask to match dimensions
-        mask_unsqueezed = mask_bool.unsqueeze(0)
-        # Use the unsqueezed mask to fill attention scores
-        attn_scores.masked_fill_(mask_unsqueezed, -torch.inf)
+
+        # Use the mask to fill attention scores
+        attn_scores.masked_fill_(mask_bool, -torch.inf)
 
         attn_weights = torch.softmax(attn_scores / keys.shape[-1]**0.5, dim=-1)
         attn_weights = self.dropout(attn_weights)
diff --git a/ch04/01_main-chapter-code/previous_chapters.py b/ch04/01_main-chapter-code/previous_chapters.py
index 21b2edf..ec8c3c7 100644
--- a/ch04/01_main-chapter-code/previous_chapters.py
+++ b/ch04/01_main-chapter-code/previous_chapters.py
@@ -78,12 +78,12 @@ class MultiHeadAttention(nn.Module):
 
         # Compute scaled dot-product attention (aka self-attention) with a causal mask
         attn_scores = queries @ keys.transpose(2, 3)  # Dot product for each head
+
         # Original mask truncated to the number of tokens and converted to boolean
         mask_bool = self.mask.bool()[:num_tokens, :num_tokens]
-        # Unsqueeze the mask to match dimensions
-        mask_unsqueezed = mask_bool.unsqueeze(0)
-        # Use the unsqueezed mask to fill attention scores
-        attn_scores.masked_fill_(mask_unsqueezed, -torch.inf)
+
+        # Use the mask to fill attention scores
+        attn_scores.masked_fill_(mask_bool, -torch.inf)
 
         attn_weights = torch.softmax(attn_scores / keys.shape[-1]**0.5, dim=-1)
         attn_weights = self.dropout(attn_weights)
diff --git a/ch05/02_hparam_tuning/previous_chapters.py b/ch05/02_hparam_tuning/previous_chapters.py
index fc8f64b..2c6ca71 100644
--- a/ch05/02_hparam_tuning/previous_chapters.py
+++ b/ch05/02_hparam_tuning/previous_chapters.py
@@ -89,12 +89,12 @@ class MultiHeadAttention(nn.Module):
 
         # Compute scaled dot-product attention (aka self-attention) with a causal mask
         attn_scores = queries @ keys.transpose(2, 3)  # Dot product for each head
+
         # Original mask truncated to the number of tokens and converted to boolean
         mask_bool = self.mask.bool()[:num_tokens, :num_tokens]
-        # Unsqueeze the mask twice to match dimensions
-        mask_unsqueezed = mask_bool.unsqueeze(0).unsqueeze(0)
-        # Use the unsqueezed mask to fill attention scores
-        attn_scores.masked_fill_(mask_unsqueezed, -torch.inf)
+
+        # Use the mask to fill attention scores
+        attn_scores.masked_fill_(mask_bool, -torch.inf)
 
         attn_weights = torch.softmax(attn_scores / keys.shape[-1]**0.5, dim=-1)
         attn_weights = self.dropout(attn_weights)
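
Note (not part of the patch above): dropping the explicit unsqueeze is safe because Tensor.masked_fill_ only requires the mask to be broadcastable to the tensor it modifies, so the 2D (num_tokens, num_tokens) boolean mask is expanded automatically over the leading batch and head dimensions of the 4D attention-score tensor. A minimal, self-contained sketch with placeholder sizes:

import torch

# Placeholder sizes, chosen only for this demo
b, num_heads, num_tokens, head_dim = 2, 4, 6, 8

attn_scores = torch.randn(b, num_heads, num_tokens, num_tokens)

# Causal mask as registered in MultiHeadAttention: True above the diagonal
mask_bool = torch.triu(torch.ones(num_tokens, num_tokens), diagonal=1).bool()

# (num_tokens, num_tokens) broadcasts to (b, num_heads, num_tokens, num_tokens),
# so no unsqueeze is needed before masking
attn_scores.masked_fill_(mask_bool, -torch.inf)

attn_weights = torch.softmax(attn_scores / head_dim**0.5, dim=-1)
print(attn_weights[0, 0])  # upper triangle is zero, i.e., future positions are masked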