From b827bf4eea5f99fc728385739be53aa82f1c12c9 Mon Sep 17 00:00:00 2001
From: rasbt
Date: Thu, 29 Feb 2024 08:31:07 -0600
Subject: [PATCH] remove redundant double-unsqueeze

---
 ch03/01_main-chapter-code/ch03.ipynb          | 22 ++++++-------------
 .../multihead-attention.ipynb                 |  4 ++--
 ch04/01_main-chapter-code/gpt.py              |  4 ++--
 .../01_main-chapter-code/previous_chapters.py |  4 ++--
 4 files changed, 13 insertions(+), 21 deletions(-)

diff --git a/ch03/01_main-chapter-code/ch03.ipynb b/ch03/01_main-chapter-code/ch03.ipynb
index 7ee5e71..cae40f0 100644
--- a/ch03/01_main-chapter-code/ch03.ipynb
+++ b/ch03/01_main-chapter-code/ch03.ipynb
@@ -1608,7 +1608,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 37,
+   "execution_count": 39,
    "id": "110b0188-6e9e-4e56-a988-10523c6c8538",
    "metadata": {},
    "outputs": [
@@ -1672,8 +1672,8 @@
     "        attn_scores = queries @ keys.transpose(2, 3)  # Dot product for each head\n",
     "        # Original mask truncated to the number of tokens and converted to boolean\n",
     "        mask_bool = self.mask.bool()[:num_tokens, :num_tokens]\n",
-    "        # Unsqueeze the mask twice to match dimensions\n",
-    "        mask_unsqueezed = mask_bool.unsqueeze(0).unsqueeze(0)\n",
+    "        # Unsqueeze the mask to match dimensions\n",
+    "        mask_unsqueezed = mask_bool.unsqueeze(0)\n",
     "        # Use the unsqueezed mask to fill attention scores\n",
     "        attn_scores.masked_fill_(mask_unsqueezed, -torch.inf)\n",
     "        \n",
@@ -1729,7 +1729,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 38,
+   "execution_count": 40,
    "id": "e8cfc1ae-78ab-4faa-bc73-98bd054806c9",
    "metadata": {},
    "outputs": [
@@ -1772,7 +1772,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 39,
+   "execution_count": 41,
    "id": "053760f1-1a02-42f0-b3bf-3d939e407039",
    "metadata": {},
    "outputs": [
@@ -1804,7 +1804,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 40,
+   "execution_count": 42,
    "id": "08c2a3fd-e674-4d69-9ef4-ea94b788e937",
    "metadata": {},
    "outputs": [
@@ -1814,7 +1814,7 @@
        "2360064"
       ]
      },
-     "execution_count": 40,
+     "execution_count": 42,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -1847,14 +1847,6 @@
    "source": [
     "- See the [./multihead-attention.ipynb](./multihead-attention.ipynb) code notebook, which is a concise version of the data loader (chapter 2) plus the multi-head attention class that we implemented in this chapter and will need for training the GPT model in upcoming chapters."
    ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "9f5b7a94-78d0-49d5-896f-21696cb331b7",
-   "metadata": {},
-   "outputs": [],
-   "source": []
   }
  ],
  "metadata": {
diff --git a/ch03/01_main-chapter-code/multihead-attention.ipynb b/ch03/01_main-chapter-code/multihead-attention.ipynb
index 37f7c50..857316f 100644
--- a/ch03/01_main-chapter-code/multihead-attention.ipynb
+++ b/ch03/01_main-chapter-code/multihead-attention.ipynb
@@ -278,8 +278,8 @@
     "        attn_scores = queries @ keys.transpose(2, 3)  # Dot product for each head\n",
     "        # Original mask truncated to the number of tokens and converted to boolean\n",
     "        mask_bool = self.mask.bool()[:num_tokens, :num_tokens]\n",
-    "        # Unsqueeze the mask twice to match dimensions\n",
-    "        mask_unsqueezed = mask_bool.unsqueeze(0).unsqueeze(0)\n",
+    "        # Unsqueeze the mask to match dimensions\n",
+    "        mask_unsqueezed = mask_bool.unsqueeze(0)\n",
     "        # Use the unsqueezed mask to fill attention scores\n",
     "        attn_scores.masked_fill_(mask_unsqueezed, -torch.inf)\n",
     "        \n",
diff --git a/ch04/01_main-chapter-code/gpt.py b/ch04/01_main-chapter-code/gpt.py
index b3e1335..b103b72 100644
--- a/ch04/01_main-chapter-code/gpt.py
+++ b/ch04/01_main-chapter-code/gpt.py
@@ -91,8 +91,8 @@ class MultiHeadAttention(nn.Module):
         attn_scores = queries @ keys.transpose(2, 3)  # Dot product for each head
         # Original mask truncated to the number of tokens and converted to boolean
         mask_bool = self.mask.bool()[:num_tokens, :num_tokens]
-        # Unsqueeze the mask twice to match dimensions
-        mask_unsqueezed = mask_bool.unsqueeze(0).unsqueeze(0)
+        # Unsqueeze the mask to match dimensions
+        mask_unsqueezed = mask_bool.unsqueeze(0)
         # Use the unsqueezed mask to fill attention scores
         attn_scores.masked_fill_(mask_unsqueezed, -torch.inf)

diff --git a/ch04/01_main-chapter-code/previous_chapters.py b/ch04/01_main-chapter-code/previous_chapters.py
index 28426cd..926dba4 100644
--- a/ch04/01_main-chapter-code/previous_chapters.py
+++ b/ch04/01_main-chapter-code/previous_chapters.py
@@ -80,8 +80,8 @@ class MultiHeadAttention(nn.Module):
         attn_scores = queries @ keys.transpose(2, 3)  # Dot product for each head
         # Original mask truncated to the number of tokens and converted to boolean
         mask_bool = self.mask.bool()[:num_tokens, :num_tokens]
-        # Unsqueeze the mask twice to match dimensions
-        mask_unsqueezed = mask_bool.unsqueeze(0).unsqueeze(0)
+        # Unsqueeze the mask to match dimensions
+        mask_unsqueezed = mask_bool.unsqueeze(0)
         # Use the unsqueezed mask to fill attention scores
         attn_scores.masked_fill_(mask_unsqueezed, -torch.inf)
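Why the single unsqueeze suffices: masked_fill_ broadcasts its mask against attn_scores of shape (b, num_heads, num_tokens, num_tokens), and broadcasting aligns trailing dimensions and left-pads missing leading axes with size 1, so a (1, num_tokens, num_tokens) mask behaves exactly like the former (1, 1, num_tokens, num_tokens) one. Below is a minimal standalone sanity-check sketch, not part of the patch; the batch size, head count, and token count are illustrative.

import torch

b, num_heads, num_tokens = 2, 4, 6
attn_scores = torch.randn(b, num_heads, num_tokens, num_tokens)

# Causal mask: True above the diagonal marks positions to hide
mask_bool = torch.triu(torch.ones(num_tokens, num_tokens), diagonal=1).bool()

# New version: single unsqueeze -> mask shape (1, num_tokens, num_tokens)
scores_single = attn_scores.clone()
scores_single.masked_fill_(mask_bool.unsqueeze(0), -torch.inf)

# Old version: double unsqueeze -> mask shape (1, 1, num_tokens, num_tokens)
scores_double = attn_scores.clone()
scores_double.masked_fill_(mask_bool.unsqueeze(0).unsqueeze(0), -torch.inf)

# Both broadcasts fill the same positions, so the results are identical
assert torch.equal(scores_single, scores_double)

Note that the raw (num_tokens, num_tokens) mask would broadcast correctly as well; keeping one unsqueeze arguably just makes the intended batch/head alignment explicit.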