# Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt).
# Source for "Build a Large Language Model From Scratch"
#   - https://www.manning.com/books/build-a-large-language-model-from-scratch
# Code: https://github.com/rasbt/LLMs-from-scratch
#
# This file contains the relevant code from chapter 3 that is going to be used
# in forthcoming chapters.

import torch
import torch.nn as nn

class CausalAttention(nn.Module):

    def __init__(self, d_in, d_out, context_length, dropout, qkv_bias=False):
        super().__init__()
        self.d_out = d_out
        self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.dropout = nn.Dropout(dropout)  # New
        self.register_buffer('mask', torch.triu(torch.ones(context_length, context_length), diagonal=1))  # New

    def forward(self, x):
        b, num_tokens, d_in = x.shape  # New batch dimension b
        keys = self.W_key(x)
        queries = self.W_query(x)
        values = self.W_value(x)

        attn_scores = queries @ keys.transpose(1, 2)  # Changed transpose
        attn_scores.masked_fill_(  # New, _ ops are in-place
            self.mask.bool()[:num_tokens, :num_tokens], -torch.inf)
        attn_weights = torch.softmax(attn_scores / keys.shape[-1]**0.5, dim=-1)
        attn_weights = self.dropout(attn_weights)  # New

        context_vec = attn_weights @ values
        return context_vec
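
# Example (illustrative toy shapes, not values prescribed by the code above):
# for an input x of shape (2, 6, 3),
# CausalAttention(d_in=3, d_out=2, context_length=6, dropout=0.0)(x)
# returns a context tensor of shape (2, 6, 2).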

class MultiHeadAttentionWrapper(nn.Module):

    def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False):
        super().__init__()
        self.heads = nn.ModuleList(
            [CausalAttention(d_in, d_out, context_length, dropout, qkv_bias)
             for _ in range(num_heads)]
        )
        self.out_proj = nn.Linear(d_out*num_heads, d_out*num_heads)

    def forward(self, x):
        context_vec = torch.cat([head(x) for head in self.heads], dim=-1)
        return self.out_proj(context_vec)
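
# Note: the wrapper above concatenates num_heads independent CausalAttention
# heads along the last dimension, so its output has d_out * num_heads features.
# The MultiHeadAttention class below instead splits a single d_out across
# num_heads heads (head_dim = d_out // num_heads) within one set of weight
# matrices, avoiding several separate attention modules.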

class MultiHeadAttention(nn.Module):
    def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False):
        super().__init__()
        assert d_out % num_heads == 0, "d_out must be divisible by num_heads"

        self.d_out = d_out
        self.num_heads = num_heads
        self.head_dim = d_out // num_heads  # Reduce the projection dim to match desired output dim

        self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.out_proj = nn.Linear(d_out, d_out)  # Linear layer to combine head outputs
        self.dropout = nn.Dropout(dropout)
        self.register_buffer('mask', torch.triu(torch.ones(context_length, context_length), diagonal=1))

    def forward(self, x):
        b, num_tokens, d_in = x.shape

        keys = self.W_key(x)  # Shape: (b, num_tokens, d_out)
        queries = self.W_query(x)
        values = self.W_value(x)

        # We implicitly split the matrix by adding a `num_heads` dimension
        # Unroll last dim: (b, num_tokens, d_out) -> (b, num_tokens, num_heads, head_dim)
        keys = keys.view(b, num_tokens, self.num_heads, self.head_dim)
        values = values.view(b, num_tokens, self.num_heads, self.head_dim)
        queries = queries.view(b, num_tokens, self.num_heads, self.head_dim)

        # Transpose: (b, num_tokens, num_heads, head_dim) -> (b, num_heads, num_tokens, head_dim)
        keys = keys.transpose(1, 2)
        queries = queries.transpose(1, 2)
        values = values.transpose(1, 2)

        # Compute scaled dot-product attention (aka self-attention) with a causal mask
        attn_scores = queries @ keys.transpose(2, 3)  # Dot product for each head

        # Original mask truncated to the number of tokens and converted to boolean
        mask_bool = self.mask.bool()[:num_tokens, :num_tokens]

        # Use the mask to fill attention scores
        attn_scores.masked_fill_(mask_bool, -torch.inf)

        attn_weights = torch.softmax(attn_scores / keys.shape[-1]**0.5, dim=-1)
        attn_weights = self.dropout(attn_weights)

        # Shape: (b, num_tokens, num_heads, head_dim)
        context_vec = (attn_weights @ values).transpose(1, 2)

        # Combine heads, where self.d_out = self.num_heads * self.head_dim
        context_vec = context_vec.contiguous().view(b, num_tokens, self.d_out)
        context_vec = self.out_proj(context_vec)  # optional projection

        return context_vec
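

# Minimal usage sketch. The hyperparameters and tensor sizes below are
# illustrative assumptions, not values prescribed by this module.
if __name__ == "__main__":
    torch.manual_seed(123)

    batch_size, num_tokens, d_in = 2, 6, 3
    x = torch.randn(batch_size, num_tokens, d_in)

    # Single causal attention head: (2, 6, 3) -> (2, 6, 2)
    ca = CausalAttention(d_in=d_in, d_out=2, context_length=num_tokens, dropout=0.0)
    print(ca(x).shape)

    # Wrapper concatenates two heads of d_out=2: (2, 6, 3) -> (2, 6, 4)
    mha_wrapper = MultiHeadAttentionWrapper(
        d_in=d_in, d_out=2, context_length=num_tokens, dropout=0.0, num_heads=2)
    print(mha_wrapper(x).shape)

    # Multi-head attention splits d_out=4 across two heads: (2, 6, 3) -> (2, 6, 4)
    mha = MultiHeadAttention(
        d_in=d_in, d_out=4, context_length=num_tokens, dropout=0.0, num_heads=2)
    print(mha(x).shape)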