# File: LLMs-from-scratch/ch04/01_main-chapter-code/previous_chapters.py

# Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt).
# Source for "Build a Large Language Model From Scratch"
#   - https://www.manning.com/books/build-a-large-language-model-from-scratch
# Code: https://github.com/rasbt/LLMs-from-scratch

import tiktoken
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader


class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-id pairs.

    The text is tokenized once. Each sample is a window of ``max_length``
    token ids, paired with the same window shifted one position to the
    right (the next-token targets).

    Args:
        txt: Text to tokenize.
        tokenizer: Object exposing ``encode(txt, allowed_special=...)`` that
            returns a sliceable sequence of token ids.
        max_length: Number of token ids in every input/target window.
        stride: Distance between the starts of consecutive windows.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        # Encode the whole text up front; "<|endoftext|>" is allowed as a special token.
        encoded = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

        # Window starts stop early enough that the shifted target window
        # never runs past the end of the encoded text.
        window_starts = range(0, len(encoded) - max_length, stride)

        self.input_ids = [
            torch.tensor(encoded[start:start + max_length])
            for start in window_starts
        ]
        self.target_ids = [
            torch.tensor(encoded[start + 1:start + max_length + 1])
            for start in window_starts
        ]

    def __len__(self):
        """Return the number of windows extracted from the text."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, target_ids)`` tensor pair at ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]


def create_dataloader_v1(txt, batch_size=4, max_length=256,
                         stride=128, shuffle=True, drop_last=True, num_workers=0):
    """Build a ``DataLoader`` over sliding-window samples of ``txt``.

    The text is tokenized with tiktoken's "gpt2" encoding and wrapped in a
    ``GPTDatasetV1``.

    Args:
        txt: Text to tokenize and split into windows.
        batch_size: Passed through to ``DataLoader``.
        max_length: Window length passed to ``GPTDatasetV1``.
        stride: Window step passed to ``GPTDatasetV1``.
        shuffle: Passed through to ``DataLoader``.
        drop_last: Passed through to ``DataLoader``.
        num_workers: Passed through to ``DataLoader``.

    Returns:
        The configured ``DataLoader``.
    """
    gpt2_tokenizer = tiktoken.get_encoding("gpt2")
    windows = GPTDatasetV1(txt, gpt2_tokenizer, max_length, stride)

    loader_options = {
        "batch_size": batch_size,
        "shuffle": shuffle,
        "drop_last": drop_last,
        "num_workers": num_workers,
    }
    return DataLoader(windows, **loader_options)


class MultiHeadAttention(nn.Module):
    """Causal multi-head self-attention.

    Queries, keys and values are each projected from ``d_in`` to ``d_out``
    and split into ``num_heads`` heads of size ``d_out // num_heads``. Each
    position attends only to itself and earlier positions. The per-head
    results are concatenated and passed through a final linear layer.

    Args:
        d_in: Size of the last dimension of the input.
        d_out: Total output size across all heads; must be divisible by
            ``num_heads``.
        context_length: Side length of the square causal mask buffer.
        dropout: Dropout probability applied to the attention weights.
        num_heads: Number of attention heads.
        qkv_bias: Whether the query/key/value projections have a bias term.
    """

    def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False):
        super().__init__()
        assert d_out % num_heads == 0, "d_out must be divisible by num_heads"

        self.d_out = d_out
        self.num_heads = num_heads
        # Each head works on an equal slice of the d_out-sized projection.
        self.head_dim = d_out // num_heads

        # Layers are created in a fixed order so that seeded initialization
        # yields the same weights for the same seed.
        self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
        # Mixes the concatenated head outputs.
        self.out_proj = nn.Linear(d_out, d_out)
        self.dropout = nn.Dropout(dropout)

        # Ones strictly above the diagonal mark the "future" positions.
        future_positions = torch.triu(
            torch.ones(context_length, context_length), diagonal=1
        )
        self.register_buffer("mask", future_positions)

    def _split_heads(self, projected, b, num_tokens):
        """Reshape (b, num_tokens, d_out) to (b, num_heads, num_tokens, head_dim)."""
        per_head = projected.view(b, num_tokens, self.num_heads, self.head_dim)
        return per_head.transpose(1, 2)

    def forward(self, x):
        """Apply causal multi-head self-attention.

        Args:
            x: Tensor of shape ``(b, num_tokens, d_in)``.

        Returns:
            Tensor of shape ``(b, num_tokens, d_out)``.
        """
        b, num_tokens, _ = x.shape

        # Project, then move the head axis in front of the token axis.
        keys = self._split_heads(self.W_key(x), b, num_tokens)
        queries = self._split_heads(self.W_query(x), b, num_tokens)
        values = self._split_heads(self.W_value(x), b, num_tokens)

        # Per-head dot products: (b, num_heads, num_tokens, num_tokens).
        attn_scores = queries @ keys.transpose(-2, -1)

        # Crop the stored mask to the current sequence length, then block
        # attention to later positions by setting their scores to -inf.
        future = self.mask[:num_tokens, :num_tokens].bool()
        attn_scores.masked_fill_(future, -torch.inf)

        # Scale by sqrt(head_dim) before the softmax over the key axis.
        scale = self.head_dim ** 0.5
        attn_weights = self.dropout(torch.softmax(attn_scores / scale, dim=-1))

        # Back to (b, num_tokens, num_heads, head_dim), then merge the heads
        # so that the last dimension is d_out = num_heads * head_dim.
        per_head_context = (attn_weights @ values).transpose(1, 2)
        merged = per_head_context.contiguous().view(b, num_tokens, self.d_out)

        return self.out_proj(merged)
Ch05 supplementary code (#81) 2024-03-19 09:26:26 -05:00			`# Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt).`
			`# Source for "Build a Large Language Model From Scratch"`
			`# - https://www.manning.com/books/build-a-large-language-model-from-scratch`
			`# Code: https://github.com/rasbt/LLMs-from-scratch`

add code backbone ch04 2024-01-29 08:13:52 -06:00			`import tiktoken`
			`import torch`
			`import torch.nn as nn`
			`from torch.utils.data import Dataset, DataLoader`


			`class GPTDatasetV1(Dataset):`
			`def __init__(self, txt, tokenizer, max_length, stride):`
			`self.input_ids = []`
			`self.target_ids = []`

			`# Tokenize the entire text`
Remove leftover instances of self.tokenizer (#201) * Remove leftover instances of self.tokenizer * add endoftext token 2024-06-08 14:57:34 -05:00			`token_ids = tokenizer.encode(txt, allowed_special={"<\|endoftext\|>"})`
add code backbone ch04 2024-01-29 08:13:52 -06:00
			`# Use a sliding window to chunk the book into overlapping sequences of max_length`
			`for i in range(0, len(token_ids) - max_length, stride):`
			`input_chunk = token_ids[i:i + max_length]`
			`target_chunk = token_ids[i + 1: i + max_length + 1]`
			`self.input_ids.append(torch.tensor(input_chunk))`
			`self.target_ids.append(torch.tensor(target_chunk))`

			`def __len__(self):`
			`return len(self.input_ids)`

			`def __getitem__(self, idx):`
			`return self.input_ids[idx], self.target_ids[idx]`


Update pep8 (#78) * simplify requirements file * style * apply linter 2024-03-18 08:16:17 -05:00			`def create_dataloader_v1(txt, batch_size=4, max_length=256,`
Make datesets and loaders compatible with multiprocessing (#118) 2024-04-13 14:57:56 -04:00			`stride=128, shuffle=True, drop_last=True, num_workers=0):`
add code backbone ch04 2024-01-29 08:13:52 -06:00			`# Initialize the tokenizer`
			`tokenizer = tiktoken.get_encoding("gpt2")`

			`# Create dataset`
			`dataset = GPTDatasetV1(txt, tokenizer, max_length, stride)`

			`# Create dataloader`
drop_last=True 2024-02-25 07:23:38 -06:00			`dataloader = DataLoader(`
fixed num_workers (#229) * fixed num_workers * ch06 & ch07: added num_workers to create_dataloader_v1 2024-06-20 00:36:46 +02:00			`dataset, batch_size=batch_size, shuffle=shuffle, drop_last=drop_last, num_workers=num_workers)`
add code backbone ch04 2024-01-29 08:13:52 -06:00
			`return dataloader`


			`class MultiHeadAttention(nn.Module):`
Rename variable to context_length to make it easier on readers (#106) * rename to context length * fix spacing 2024-04-04 07:27:41 -05:00			`def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False):`
add code backbone ch04 2024-01-29 08:13:52 -06:00			`super().__init__()`
mha variants 2024-03-06 08:30:32 -06:00			`assert d_out % num_heads == 0, "d_out must be divisible by num_heads"`
add code backbone ch04 2024-01-29 08:13:52 -06:00
			`self.d_out = d_out`
			`self.num_heads = num_heads`
Update pep8 (#78) * simplify requirements file * style * apply linter 2024-03-18 08:16:17 -05:00			`self.head_dim = d_out // num_heads # Reduce the projection dim to match desired output dim`
add code backbone ch04 2024-01-29 08:13:52 -06:00
			`self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)`
			`self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)`
			`self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)`
			`self.out_proj = nn.Linear(d_out, d_out) # Linear layer to combine head outputs`
			`self.dropout = nn.Dropout(dropout)`
Rename variable to context_length to make it easier on readers (#106) * rename to context length * fix spacing 2024-04-04 07:27:41 -05:00			`self.register_buffer('mask', torch.triu(torch.ones(context_length, context_length), diagonal=1))`
add code backbone ch04 2024-01-29 08:13:52 -06:00
			`def forward(self, x):`
			`b, num_tokens, d_in = x.shape`

Update pep8 (#78) * simplify requirements file * style * apply linter 2024-03-18 08:16:17 -05:00			`keys = self.W_key(x) # Shape: (b, num_tokens, d_out)`
add code backbone ch04 2024-01-29 08:13:52 -06:00			`queries = self.W_query(x)`
			`values = self.W_value(x)`

			# We implicitly split the matrix by adding a `num_heads` dimension
			`# Unroll last dim: (b, num_tokens, d_out) -> (b, num_tokens, num_heads, head_dim)`
Update pep8 (#78) * simplify requirements file * style * apply linter 2024-03-18 08:16:17 -05:00			`keys = keys.view(b, num_tokens, self.num_heads, self.head_dim)`
add code backbone ch04 2024-01-29 08:13:52 -06:00			`values = values.view(b, num_tokens, self.num_heads, self.head_dim)`
			`queries = queries.view(b, num_tokens, self.num_heads, self.head_dim)`

			`# Transpose: (b, num_tokens, num_heads, head_dim) -> (b, num_heads, num_tokens, head_dim)`
			`keys = keys.transpose(1, 2)`
			`queries = queries.transpose(1, 2)`
			`values = values.transpose(1, 2)`

			`# Compute scaled dot-product attention (aka self-attention) with a causal mask`
			`attn_scores = queries @ keys.transpose(2, 3) # Dot product for each head`
remove redundant unsqueeze in mask 2024-03-09 17:42:25 -06:00
add code backbone ch04 2024-01-29 08:13:52 -06:00			`# Original mask truncated to the number of tokens and converted to boolean`
			`mask_bool = self.mask.bool()[:num_tokens, :num_tokens]`
remove redundant unsqueeze in mask 2024-03-09 17:42:25 -06:00
			`# Use the mask to fill attention scores`
			`attn_scores.masked_fill_(mask_bool, -torch.inf)`
Update pep8 (#78) * simplify requirements file * style * apply linter 2024-03-18 08:16:17 -05:00
add code backbone ch04 2024-01-29 08:13:52 -06:00			`attn_weights = torch.softmax(attn_scores / keys.shape[-1]**0.5, dim=-1)`
			`attn_weights = self.dropout(attn_weights)`

			`# Shape: (b, num_tokens, num_heads, head_dim)`
Update pep8 (#78) * simplify requirements file * style * apply linter 2024-03-18 08:16:17 -05:00			`context_vec = (attn_weights @ values).transpose(1, 2)`

add code backbone ch04 2024-01-29 08:13:52 -06:00			`# Combine heads, where self.d_out = self.num_heads * self.head_dim`
			`context_vec = context_vec.contiguous().view(b, num_tokens, self.d_out)`
Update pep8 (#78) * simplify requirements file * style * apply linter 2024-03-18 08:16:17 -05:00			`context_vec = self.out_proj(context_vec) # optional projection`
add code backbone ch04 2024-01-29 08:13:52 -06:00
Update pep8 (#78) * simplify requirements file * style * apply linter 2024-03-18 08:16:17 -05:00			`return context_vec`