{ "cells": [ { "cell_type": "markdown", "id": "6e2a4891-c257-4d6b-afb3-e8fef39d0437", "metadata": {}, "source": [ "\n", "\n", "\n", "\n", "\n", "
\n", "\n", "Supplementary code for the Build a Large Language Model From Scratch book by Sebastian Raschka
\n", "
Code repository: https://github.com/rasbt/LLMs-from-scratch\n", "
\n", "
\n", "\n", "
\n" ] }, { "cell_type": "markdown", "id": "6f678e62-7bcb-4405-86ae-dce94f494303", "metadata": {}, "source": [ "# The Main Data Loading Pipeline Summarized" ] }, { "cell_type": "markdown", "id": "070000fc-a7b7-4c56-a2c0-a938d413a790", "metadata": {}, "source": [ "The complete chapter code is located in [ch02.ipynb](./ch02.ipynb).\n", "\n", "This notebook contains the main takeaway, the data loading pipeline without the intermediate steps." ] }, { "cell_type": "markdown", "id": "2b4e8f2d-cb81-41a3-8780-a70b382e18ae", "metadata": {}, "source": [ "Packages that are being used in this notebook:" ] }, { "cell_type": "code", "execution_count": 1, "id": "c7ed6fbe-45ac-40ce-8ea5-4edb212565e1", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "torch version: 2.4.0\n", "tiktoken version: 0.7.0\n" ] } ], "source": [ "# NBVAL_SKIP\n", "from importlib.metadata import version\n", "\n", "print(\"torch version:\", version(\"torch\"))\n", "print(\"tiktoken version:\", version(\"tiktoken\"))" ] }, { "cell_type": "code", "execution_count": 2, "id": "0ed4b7db-3b47-4fd3-a4a6-5f4ed5dd166e", "metadata": {}, "outputs": [], "source": [ "import tiktoken\n", "import torch\n", "from torch.utils.data import Dataset, DataLoader\n", "\n", "\n", "class GPTDatasetV1(Dataset):\n", " def __init__(self, txt, tokenizer, max_length, stride):\n", " self.input_ids = []\n", " self.target_ids = []\n", "\n", " # Tokenize the entire text\n", " token_ids = tokenizer.encode(txt, allowed_special={\"<|endoftext|>\"})\n", "\n", " # Use a sliding window to chunk the book into overlapping sequences of max_length\n", " for i in range(0, len(token_ids) - max_length, stride):\n", " input_chunk = token_ids[i:i + max_length]\n", " target_chunk = token_ids[i + 1: i + max_length + 1]\n", " self.input_ids.append(torch.tensor(input_chunk))\n", " self.target_ids.append(torch.tensor(target_chunk))\n", "\n", " def __len__(self):\n", " return len(self.input_ids)\n", "\n", " def __getitem__(self, idx):\n", " return self.input_ids[idx], self.target_ids[idx]\n", "\n", "\n", "def create_dataloader_v1(txt, batch_size, max_length, stride,\n", " shuffle=True, drop_last=True, num_workers=0):\n", " # Initialize the tokenizer\n", " tokenizer = tiktoken.get_encoding(\"gpt2\")\n", "\n", " # Create dataset\n", " dataset = GPTDatasetV1(txt, tokenizer, max_length, stride)\n", "\n", " # Create dataloader\n", " dataloader = DataLoader(\n", " dataset, batch_size=batch_size, shuffle=shuffle, drop_last=drop_last, num_workers=num_workers)\n", "\n", " return dataloader\n", "\n", "\n", "with open(\"the-verdict.txt\", \"r\", encoding=\"utf-8\") as f:\n", " raw_text = f.read()\n", "\n", "vocab_size = 50257\n", "output_dim = 256\n", "context_length = 1024\n", "\n", "\n", "token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)\n", "pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)\n", "\n", "batch_size = 8\n", "max_length = 4\n", "dataloader = create_dataloader_v1(\n", " raw_text,\n", " batch_size=batch_size,\n", " max_length=max_length,\n", " stride=max_length\n", ")" ] }, { "cell_type": "code", "execution_count": 3, "id": "664397bc-6daa-4b88-90aa-e8fc1fbd5846", "metadata": {}, "outputs": [], "source": [ "for batch in dataloader:\n", " x, y = batch\n", "\n", " token_embeddings = token_embedding_layer(x)\n", " pos_embeddings = pos_embedding_layer(torch.arange(max_length))\n", "\n", " input_embeddings = token_embeddings + pos_embeddings\n", "\n", " break" ] }, { "cell_type": "code", "execution_count": 4, "id": "d3664332-e6bb-447e-8b96-203aafde8b24", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "torch.Size([8, 4, 256])\n" ] } ], "source": [ "print(input_embeddings.shape)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.4" } }, "nbformat": 4, "nbformat_minor": 5 }