2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								{
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								 "cells": [
							 
						 
					
						
							
								
									
										
										
										
											2024-03-19 09:26:26 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "markdown",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "id": "d95f841a-63c9-41d4-aea1-496b3d2024dd",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
									
										
										
										
											2024-05-24 07:20:37 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "<table style=\"width:100%\">\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "<tr>\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "<td style=\"vertical-align:middle; text-align:left;\">\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "<font size=\"2\">\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "Supplementary code for the <a href=\"http://mng.bz/orYv\">Build a Large Language Model From Scratch</a> book by <a href=\"https://sebastianraschka.com\">Sebastian Raschka</a><br>\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "<br>Code repository: <a href=\"https://github.com/rasbt/LLMs-from-scratch\">https://github.com/rasbt/LLMs-from-scratch</a>\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "</font>\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "</td>\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "<td style=\"vertical-align:middle; text-align:left;\">\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "<a href=\"http://mng.bz/orYv\"><img src=\"https://sebastianraschka.com/images/LLMs-from-scratch-images/cover-small.webp\" width=\"100px\"></a>\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "</td>\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "</tr>\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "</table>\n"
							 
						 
					
						
							
								
									
										
										
										
											2024-03-19 09:26:26 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "markdown",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "id": "25aa40e3-5109-433f-9153-f5770531fe94",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
									
										
										
										
											2025-02-28 10:16:21 -06:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "# Chapter 2: Working with Text Data"
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
									
										
										
										
											2024-01-01 19:41:18 +01:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "markdown",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "id": "76d5d2c0-cba8-404e-9bf3-71a218cae3cf",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "Packages that are being used in this notebook:"
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "code",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "execution_count": 1,
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "id": "4d1305cf-12d5-46fe-a2c9-36fb71c5b3d3",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "outputs": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     "name": "stdout",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     "output_type": "stream",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     "text": [
							 
						 
					
						
							
								
									
										
										
										
											2025-02-04 20:16:07 -06:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								      "torch version: 2.5.1\n",
							 
						 
					
						
							
								
									
										
										
										
											2024-06-25 21:09:21 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								      "tiktoken version: 0.7.0\n"
							 
						 
					
						
							
								
									
										
										
										
											2024-01-01 19:41:18 +01:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								     ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    }
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ],
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "from importlib.metadata import version\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "print(\"torch version:\", version(\"torch\"))\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "print(\"tiktoken version:\", version(\"tiktoken\"))"
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
									
										
										
										
											2024-03-17 09:08:38 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "markdown",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "id": "5a42fbfd-e3c2-43c2-bc12-f5f870a0b10a",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "- This chapter covers data preparation and sampling to get input data \"ready\" for the LLM"
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
									
										
										
										
											2024-03-08 07:18:06 -06:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "markdown",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "id": "628b2922-594d-4ff9-bd82-04f1ebdf41f5",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
									
										
										
										
											2024-03-30 09:43:51 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "<img src=\"https://sebastianraschka.com/images/LLMs-from-scratch-images/ch02_compressed/01.webp?timestamp=1\" width=\"500px\">"
							 
						 
					
						
							
								
									
										
										
										
											2024-03-08 07:18:06 -06:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "markdown",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "id": "2417139b-2357-44d2-bd67-23f5d7f52ae7",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "## 2.1 Understanding word embeddings"
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
									
										
										
										
											2024-03-17 09:08:38 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "markdown",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "id": "0b6816ae-e927-43a9-b4dd-e47a9b0e1cf6",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "- No code in this section"
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "markdown",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "id": "4f69dab7-a433-427a-9e5b-b981062d6296",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
									
										
										
										
											2024-03-29 09:25:52 -04:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "- There are many forms of embeddings; we focus on text embeddings in this book"
							 
						 
					
						
							
								
									
										
										
										
											2024-03-17 09:08:38 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
									
										
										
										
											2024-03-08 07:18:06 -06:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "markdown",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "id": "ba08d16f-f237-4166-bf89-0e9fe703e7b4",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
									
										
										
										
											2024-03-17 09:08:38 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "<img src=\"https://sebastianraschka.com/images/LLMs-from-scratch-images/ch02_compressed/02.webp\" width=\"500px\">"
							 
						 
					
						
							
								
									
										
										
										
											2024-03-08 07:18:06 -06:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
									
										
										
										
											2023-12-08 06:30:20 -06:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "markdown",
							 
						 
					
						
							
								
									
										
										
										
											2024-03-17 09:08:38 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								   "id": "288c4faf-b93a-4616-9276-7a4aa4b5e9ba",
							 
						 
					
						
							
								
									
										
										
										
											2023-12-08 06:30:20 -06:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
									
										
										
										
											2024-03-29 09:25:52 -04:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "- LLMs work with embeddings in high-dimensional spaces (i.e., thousands of dimensions)\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "- Since we can't visualize such high-dimensional spaces (we humans think in 1, 2, or 3 dimensions), the figure below illustrates a 2-dimensional embedding space"
							 
						 
					
						
							
								
									
										
										
										
											2023-12-08 06:30:20 -06:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
									
										
										
										
											2024-03-08 07:18:06 -06:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "markdown",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "id": "d6b80160-1f10-4aad-a85e-9c79444de9e6",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
									
										
										
										
											2024-03-17 09:08:38 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "<img src=\"https://sebastianraschka.com/images/LLMs-from-scratch-images/ch02_compressed/03.webp\" width=\"300px\">"
							 
						 
					
						
							
								
									
										
										
										
											2024-03-08 07:18:06 -06:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "markdown",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "id": "eddbb984-8d23-40c5-bbfa-c3c379e7eec3",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "## 2.2 Tokenizing text"
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
									
										
										
										
											2024-03-17 09:08:38 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "markdown",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "id": "f9c90731-7dc9-4cd3-8c4a-488e33b48e80",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "- In this section, we tokenize text, which means breaking text into smaller units, such as individual words and punctuation characters"
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
									
										
										
										
											2024-03-08 07:18:06 -06:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "markdown",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "id": "09872fdb-9d4e-40c4-949d-52a01a43ec4b",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
									
										
										
										
											2024-03-17 09:08:38 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "<img src=\"https://sebastianraschka.com/images/LLMs-from-scratch-images/ch02_compressed/04.webp\" width=\"300px\">"
							 
						 
					
						
							
								
									
										
										
										
											2024-03-08 07:18:06 -06:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "markdown",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "id": "8cceaa18-833d-46b6-b211-b20c53902805",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "- Load raw text we want to work with\n",
							 
						 
					
						
							
								
									
										
										
										
											2024-01-26 00:43:57 +08:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "- [The Verdict by Edith Wharton](https://en.wikisource.org/wiki/The_Verdict) is a public domain short story"
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "code",
							 
						 
					
						
							
								
									
										
										
										
											2024-01-17 07:50:57 -06:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								   "execution_count": 2,
							 
						 
					
						
							
								
									
										
										
										
											2024-05-18 11:03:42 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								   "id": "40f9d9b1-6d32-485a-825a-a95392a86d79",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "outputs": [],
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "import os\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "import urllib.request\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "if not os.path.exists(\"the-verdict.txt\"):\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "    url = (\"https://raw.githubusercontent.com/rasbt/\"\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "           \"LLMs-from-scratch/main/ch02/01_main-chapter-code/\"\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "           \"the-verdict.txt\")\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "    file_path = \"the-verdict.txt\"\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "    urllib.request.urlretrieve(url, file_path)"
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
									
										
										
										
											2024-10-19 16:27:19 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "markdown",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "id": "56488f2c-a2b8-49f1-aaeb-461faad08dce",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "- (If you encounter an `ssl.SSLCertVerificationError` when executing the previous code cell, it might be due to using an outdated Python version; you can find [more information here on GitHub](https://github.com/rasbt/LLMs-from-scratch/pull/403))"
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
									
										
										
										
											2024-05-18 11:03:42 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "code",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "execution_count": 3,
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								   "id": "8a769e87-470a-48b9-8bdb-12841b416198",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "outputs": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     "name": "stdout",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     "output_type": "stream",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     "text": [
							 
						 
					
						
							
								
									
										
										
										
											2023-10-02 06:48:16 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								      "Total number of character: 20479\n",
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								      "I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no \n"
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    }
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ],
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
									
										
										
										
											2023-10-02 06:48:16 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "with open(\"the-verdict.txt\", \"r\", encoding=\"utf-8\") as f:\n",
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								    "    raw_text = f.read()\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "    \n",
							 
						 
					
						
							
								
									
										
										
										
											2023-10-02 06:48:16 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "print(\"Total number of character:\", len(raw_text))\n",
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								    "print(raw_text[:99])"
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "markdown",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "id": "9b971a46-ac03-4368-88ae-3f20279e8f4e",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "- The goal is to tokenize and embed this text for an LLM\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "- Let's develop a simple tokenizer based on some simple sample text that we can then later apply to the text above\n",
							 
						 
					
						
							
								
									
										
										
										
											2023-10-02 06:48:16 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "- The following regular expression will split on whitespaces"
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "code",
							 
						 
					
						
							
								
									
										
										
										
											2024-05-18 11:03:42 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								   "execution_count": 4,
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								   "id": "737dd5b0-9dbb-4a97-9ae4-3482c8c04be7",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "outputs": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     "name": "stdout",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     "output_type": "stream",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     "text": [
							 
						 
					
						
							
								
									
										
										
										
											2023-10-02 06:48:16 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								      "['Hello,', ' ', 'world.', ' ', 'This,', ' ', 'is', ' ', 'a', ' ', 'test.']\n"
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								     ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    }
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ],
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "import re\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "\n",
							 
						 
					
						
							
								
									
										
										
										
											2023-10-02 06:48:16 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "text = \"Hello, world. This, is a test.\"\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "result = re.split(r'(\\s)', text)\n",
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								    "\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "print(result)"
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "markdown",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "id": "a8c40c18-a9d5-4703-bf71-8261dbcc5ee3",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
									
										
										
										
											2023-10-02 06:48:16 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "- We don't only want to split on whitespaces but also commas and periods, so let's modify the regular expression to do that as well"
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "code",
							 
						 
					
						
							
								
									
										
										
										
											2024-05-18 11:03:42 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								   "execution_count": 5,
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								   "id": "ea02489d-01f9-4247-b7dd-a0d63f62ef07",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "outputs": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     "name": "stdout",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     "output_type": "stream",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     "text": [
							 
						 
					
						
							
								
									
										
										
										
											2023-10-02 06:48:16 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								      "['Hello', ',', '', ' ', 'world', '.', '', ' ', 'This', ',', '', ' ', 'is', ' ', 'a', ' ', 'test', '.', '']\n"
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								     ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    }
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ],
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
									
										
										
										
											2023-10-02 06:48:16 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "result = re.split(r'([,.]|\\s)', text)\n",
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								    "\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "print(result)"
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "markdown",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "id": "461d0c86-e3af-4f87-8fae-594a9ca9b6ad",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "- As we can see, this creates empty strings, let's remove them"
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "code",
							 
						 
					
						
							
								
									
										
										
										
											2024-05-18 11:03:42 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								   "execution_count": 6,
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								   "id": "4d8a6fb7-2e62-4a12-ad06-ccb04f25fed7",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "outputs": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     "name": "stdout",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     "output_type": "stream",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     "text": [
							 
						 
					
						
							
								
									
										
										
										
											2023-10-02 06:48:16 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								      "['Hello', ',', 'world', '.', 'This', ',', 'is', 'a', 'test', '.']\n"
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								     ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    }
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ],
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "# Strip whitespace from each item and then filter out any empty strings.\n",
							 
						 
					
						
							
								
									
										
										
										
											2024-03-07 07:52:24 -06:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "result = [item for item in result if item.strip()]\n",
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								    "print(result)"
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "markdown",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "id": "250e8694-181e-496f-895d-7cb7d92b5562",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
									
										
										
										
											2023-12-17 16:01:43 +01:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "- This looks pretty good, but let's also handle other types of punctuation, such as periods, question marks, and so on"
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "code",
							 
						 
					
						
							
								
									
										
										
										
											2024-05-22 20:27:09 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								   "execution_count": 7,
							 
						 
					
						
							
								
									
										
										
										
											2024-03-23 06:50:34 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								   "id": "ed3a9467-04b4-49d9-96c5-b8042bcf8374",
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "outputs": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     "name": "stdout",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     "output_type": "stream",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     "text": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "['Hello', ',', 'world', '.', 'Is', 'this', '--', 'a', 'test', '?']\n"
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    }
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ],
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "text = \"Hello, world. Is this-- a test?\"\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "\n",
							 
						 
					
						
							
								
									
										
										
										
											2024-03-23 06:50:34 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "result = re.split(r'([,.:;?_!\"()\\']|--|\\s)', text)\n",
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								    "result = [item.strip() for item in result if item.strip()]\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "print(result)"
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "markdown",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "id": "5bbea70b-c030-45d9-b09d-4318164c0bb4",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
									
										
										
										
											2023-10-02 06:48:16 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "- This is pretty good, and we are now ready to apply this tokenization to the raw text"
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
									
										
										
										
											2024-03-08 07:18:06 -06:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "markdown",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "id": "6cbe9330-b587-4262-be9f-497a84ec0e8a",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
									
										
										
										
											2024-03-17 09:08:38 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "<img src=\"https://sebastianraschka.com/images/LLMs-from-scratch-images/ch02_compressed/05.webp\" width=\"350px\">"
							 
						 
					
						
							
								
									
										
										
										
											2024-03-08 07:18:06 -06:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "code",
							 
						 
					
						
							
								
									
										
										
										
											2024-05-22 20:27:09 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								   "execution_count": 8,
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								   "id": "8c567caa-8ff5-49a8-a5cc-d365b0a78a99",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "outputs": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     "name": "stdout",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     "output_type": "stream",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     "text": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', ',', 'in']\n"
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    }
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ],
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
									
										
										
										
											2024-05-22 20:15:31 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "preprocessed = re.split(r'([,.:;?_!\"()\\']|--|\\s)', raw_text)\n",
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								    "preprocessed = [item.strip() for item in preprocessed if item.strip()]\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "print(preprocessed[:30])"
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "markdown",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "id": "e2a19e1a-5105-4ddb-812a-b7d3117eab95",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "- Let's calculate the total number of tokens"
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "code",
							 
						 
					
						
							
								
									
										
										
										
											2024-05-22 20:27:09 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								   "execution_count": 9,
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								   "id": "35db7b5e-510b-4c45-995f-f5ad64a8e19c",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "outputs": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     "name": "stdout",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     "output_type": "stream",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     "text": [
							 
						 
					
						
							
								
									
										
										
										
											2024-05-22 20:15:31 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								      "4690\n"
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								     ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    }
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ],
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "print(len(preprocessed))"
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
									
										
										
										
											2023-10-02 06:48:16 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "markdown",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "id": "0b5ce8fe-3a07-4f2a-90f1-a0321ce3a231",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "## 2.3 Converting tokens into token IDs"
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
									
										
										
										
											2024-03-17 09:08:38 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "markdown",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "id": "a5204973-f414-4c0d-87b0-cfec1f06e6ff",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "- Next, we convert the text tokens into token IDs that we can process via embedding layers later"
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
									
										
										
										
											2024-03-08 07:18:06 -06:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "markdown",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "id": "177b041d-f739-43b8-bd81-0443ae3a7f8d",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
									
										
										
										
											2024-03-17 09:08:38 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "<img src=\"https://sebastianraschka.com/images/LLMs-from-scratch-images/ch02_compressed/06.webp\" width=\"500px\">"
							 
						 
					
						
							
								
									
										
										
										
											2024-03-08 07:18:06 -06:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "markdown",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "id": "b5973794-7002-4202-8b12-0900cd779720",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "- From these tokens, we can now build a vocabulary that consists of all the unique tokens"
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "code",
							 
						 
					
						
							
								
									
										
										
										
											2024-05-22 20:27:09 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								   "execution_count": 10,
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								   "id": "7fdf0533-5ab6-42a5-83fa-a3b045de6396",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "outputs": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     "name": "stdout",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     "output_type": "stream",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     "text": [
							 
						 
					
						
							
								
									
										
										
										
											2024-05-22 20:15:31 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								      "1130\n"
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								     ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    }
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ],
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
									
										
										
										
											2024-05-16 20:16:25 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "all_words = sorted(set(preprocessed))\n",
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								    "vocab_size = len(all_words)\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "print(vocab_size)"
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "code",
							 
						 
					
						
							
								
									
										
										
										
											2024-05-22 20:27:09 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								   "execution_count": 11,
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								   "id": "77d00d96-881f-4691-bb03-84fec2a75a26",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "outputs": [],
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "vocab = {token:integer for integer,token in enumerate(all_words)}"
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "markdown",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "id": "75bd1f81-3a8f-4dd9-9dd6-e75f32dacbe3",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "- Below are the first 50 entries in this vocabulary:"
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "code",
							 
						 
					
						
							
								
									
										
										
										
											2024-05-22 20:27:09 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								   "execution_count": 12,
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								   "id": "e1c5de4a-aa4e-4aec-b532-10bb364039d6",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "outputs": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     "name": "stdout",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     "output_type": "stream",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     "text": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "('!', 0)\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "('\"', 1)\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "(\"'\", 2)\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "('(', 3)\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "(')', 4)\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "(',', 5)\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "('--', 6)\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "('.', 7)\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "(':', 8)\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "(';', 9)\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "('?', 10)\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "('A', 11)\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "('Ah', 12)\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "('Among', 13)\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "('And', 14)\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "('Are', 15)\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "('Arrt', 16)\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "('As', 17)\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "('At', 18)\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "('Be', 19)\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "('Begin', 20)\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "('Burlington', 21)\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "('But', 22)\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "('By', 23)\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "('Carlo', 24)\n",
							 
						 
					
						
							
								
									
										
										
										
											2024-05-22 20:15:31 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								      "('Chicago', 25)\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "('Claude', 26)\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "('Come', 27)\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "('Croft', 28)\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "('Destroyed', 29)\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "('Devonshire', 30)\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "('Don', 31)\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "('Dubarry', 32)\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "('Emperors', 33)\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "('Florence', 34)\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "('For', 35)\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "('Gallery', 36)\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "('Gideon', 37)\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "('Gisburn', 38)\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "('Gisburns', 39)\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "('Grafton', 40)\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "('Greek', 41)\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "('Grindle', 42)\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "('Grindles', 43)\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "('HAD', 44)\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "('Had', 45)\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "('Hang', 46)\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "('Has', 47)\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "('He', 48)\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "('Her', 49)\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "('Hermia', 50)\n"
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								     ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    }
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ],
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "for i, item in enumerate(vocab.items()):\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "    print(item)\n",
							 
						 
					
						
							
								
									
										
										
										
											2023-10-02 06:48:16 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "    if i >= 50:\n",
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								    "        break"
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
									
										
										
										
											2024-03-17 09:08:38 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "markdown",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "id": "3b1dc314-351b-476a-9459-0ec9ddc29b19",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "- Below, we illustrate the tokenization of a short sample text using a small vocabulary:"
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
									
										
										
										
											2024-03-08 07:18:06 -06:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "markdown",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "id": "67407a9f-0202-4e7c-9ed7-1b3154191ebc",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
									
										
										
										
											2024-07-02 17:12:42 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "<img src=\"https://sebastianraschka.com/images/LLMs-from-scratch-images/ch02_compressed/07.webp?123\" width=\"500px\">"
							 
						 
					
						
							
								
									
										
										
										
											2024-03-08 07:18:06 -06:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "markdown",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "id": "4e569647-2589-4c9d-9a5c-aef1c88a0a9a",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "- Putting it now all together into a tokenizer class"
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "code",
							 
						 
					
						
							
								
									
										
										
										
											2024-05-22 20:27:09 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								   "execution_count": 13,
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								   "id": "f531bf46-7c25-4ef8-bff8-0d27518676d5",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "outputs": [],
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "class SimpleTokenizerV1:\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "    def __init__(self, vocab):\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "        self.str_to_int = vocab\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "        self.int_to_str = {i:s for s,i in vocab.items()}\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "    \n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "    def encode(self, text):\n",
							 
						 
					
						
							
								
									
										
										
										
											2024-07-04 13:27:27 -04:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "        preprocessed = re.split(r'([,.:;?_!\"()\\']|--|\\s)', text)\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "                                \n",
							 
						 
					
						
							
								
									
										
										
										
											2024-05-18 11:03:42 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "        preprocessed = [\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "            item.strip() for item in preprocessed if item.strip()\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "        ]\n",
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								    "        ids = [self.str_to_int[s] for s in preprocessed]\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "        return ids\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "        \n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "    def decode(self, ids):\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "        text = \" \".join([self.int_to_str[i] for i in ids])\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "        # Replace spaces before the specified punctuations\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "        text = re.sub(r'\\s+([,.?!\"()\\'])', r'\\1', text)\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "        return text"
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
									
										
										
										
											2024-03-17 09:08:38 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "markdown",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "id": "dee7a1e5-b54f-4ca1-87ef-3d663c4ee1e7",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "- The `encode` function turns text into token IDs\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "- The `decode` function turns token IDs back into text"
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
									
										
										
										
											2024-03-08 07:18:06 -06:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "markdown",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "id": "cc21d347-ec03-4823-b3d4-9d686e495617",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
									
										
										
										
											2024-07-02 17:12:42 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "<img src=\"https://sebastianraschka.com/images/LLMs-from-scratch-images/ch02_compressed/08.webp?123\" width=\"500px\">"
							 
						 
					
						
							
								
									
										
										
										
											2024-03-08 07:18:06 -06:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "markdown",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "id": "c2950a94-6b0d-474e-8ed0-66d0c3c1a95c",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "- We can use the tokenizer to encode (that is, tokenize) texts into integers\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "- These integers can then be embedded (later) as input of/for the LLM"
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "code",
							 
						 
					
						
							
								
									
										
										
										
											2024-05-22 20:27:09 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								   "execution_count": 14,
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								   "id": "647364ec-7995-4654-9b4a-7607ccf5f1e4",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "outputs": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     "name": "stdout",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     "output_type": "stream",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     "text": [
							 
						 
					
						
							
								
									
										
										
										
											2024-05-22 20:15:31 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								      "[1, 56, 2, 850, 988, 602, 533, 746, 5, 1126, 596, 5, 1, 67, 7, 38, 851, 1108, 754, 793, 7]\n"
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								     ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    }
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ],
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "tokenizer = SimpleTokenizerV1(vocab)\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "\n",
							 
						 
					
						
							
								
									
										
										
										
											2024-05-18 11:03:42 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "text = \"\"\"\"It's the last he painted, you know,\" \n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "           Mrs. Gisburn said with pardonable pride.\"\"\"\n",
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								    "ids = tokenizer.encode(text)\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "print(ids)"
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "markdown",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "id": "3201706e-a487-4b60-b99d-5765865f29a0",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "- We can decode the integers back into text"
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "code",
							 
						 
					
						
							
								
									
										
										
										
											2024-05-22 20:27:09 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								   "execution_count": 15,
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								   "id": "01d8c8fb-432d-4a49-b332-99f23b233746",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "outputs": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     "data": {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "text/plain": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								       "'\" It\\' s the last he painted, you know,\" Mrs. Gisburn said with pardonable pride.'"
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     },
							 
						 
					
						
							
								
									
										
										
										
											2024-05-22 20:27:09 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								     "execution_count": 15,
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								     "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     "output_type": "execute_result"
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    }
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ],
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "tokenizer.decode(ids)"
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
									
										
										
										
											2024-01-17 07:50:57 -06:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								   "cell_type": "code",
							 
						 
					
						
							
								
									
										
										
										
											2024-05-22 20:27:09 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								   "execution_count": 16,
							 
						 
					
						
							
								
									
										
										
										
											2024-01-17 07:50:57 -06:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								   "id": "54f6aa8b-9827-412e-9035-e827296ab0fe",
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
									
										
										
										
											2024-01-17 07:50:57 -06:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								   "outputs": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     "data": {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "text/plain": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								       "'\" It\\' s the last he painted, you know,\" Mrs. Gisburn said with pardonable pride.'"
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     },
							 
						 
					
						
							
								
									
										
										
										
											2024-05-22 20:27:09 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								     "execution_count": 16,
							 
						 
					
						
							
								
									
										
										
										
											2024-01-17 07:50:57 -06:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								     "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     "output_type": "execute_result"
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    }
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ],
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "tokenizer.decode(tokenizer.encode(text))"
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "markdown",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "id": "4b821ef8-4d53-43b6-a2b2-aef808c343c7",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
									
										
										
										
											2023-10-02 06:48:16 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "## 2.4 Adding special context tokens"
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
									
										
										
										
											2024-03-17 09:08:38 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "markdown",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "id": "863d6d15-a3e2-44e0-b384-bb37f17cf443",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "- It's useful to add some \"special\" tokens for unknown words and to denote the end of a text"
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
									
										
										
										
											2024-03-08 07:18:06 -06:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "markdown",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "id": "aa7fc96c-e1fd-44fb-b7f5-229d7c7922a4",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
									
										
										
										
											2024-07-02 17:12:42 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "<img src=\"https://sebastianraschka.com/images/LLMs-from-scratch-images/ch02_compressed/09.webp?123\" width=\"500px\">"
							 
						 
					
						
							
								
									
										
										
										
											2024-03-08 07:18:06 -06:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "markdown",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "id": "9d709d57-2486-4152-b7f9-d3e4bd8634cd",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "- Some tokenizers use special tokens to help the LLM with additional context\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "- Some of these special tokens are\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "  - `[BOS]` (beginning of sequence) marks the beginning of text\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "  - `[EOS]` (end of sequence) marks where the text ends (this is usually used to concatenate multiple unrelated texts, e.g., two different Wikipedia articles or two different books, and so on)\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "  - `[PAD]` (padding) if we train LLMs with a batch size greater than 1 (we may include multiple texts with different lengths; with the padding token we pad the shorter texts to the longest length so that all texts have an equal length)\n",
							 
						 
					
						
							
								
									
										
										
										
											2024-07-17 21:34:51 +09:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "- `[UNK]` to represent words that are not included in the vocabulary\n",
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								    "\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "- Note that GPT-2 does not need any of these tokens mentioned above but only uses an `<|endoftext|>` token to reduce complexity\n",
							 
						 
					
						
							
								
									
										
										
										
											2024-06-26 00:30:30 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "- The `<|endoftext|>` is analogous to the `[EOS]` token mentioned above\n",
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								    "- GPT also uses the `<|endoftext|>` for padding (since we typically use a mask when training on batched inputs, we would not attend padded tokens anyways, so it does not matter what these tokens are)\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "- GPT-2 does not use an `<UNK>` token for out-of-vocabulary words; instead, GPT-2 uses a byte-pair encoding (BPE) tokenizer, which breaks down words into subword units which we will discuss in a later section\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "\n"
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
									
										
										
										
											2024-03-17 09:08:38 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "markdown",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "id": "a336b43b-7173-49e7-bd80-527ad4efb271",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "- We use the `<|endoftext|>` tokens between two independent sources of text:"
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
									
										
										
										
											2024-03-08 07:18:06 -06:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "markdown",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "id": "52442951-752c-4855-9752-b121a17fef55",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
									
										
										
										
											2024-03-17 09:08:38 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "<img src=\"https://sebastianraschka.com/images/LLMs-from-scratch-images/ch02_compressed/10.webp\" width=\"500px\">"
							 
						 
					
						
							
								
									
										
										
										
											2024-03-08 07:18:06 -06:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "markdown",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "id": "c661a397-da06-4a86-ac27-072dbe7cb172",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "- Let's see what happens if we tokenize the following text:"
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "code",
							 
						 
					
						
							
								
									
										
										
										
											2024-05-22 20:27:09 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								   "execution_count": 17,
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								   "id": "d5767eff-440c-4de1-9289-f789349d6b85",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "outputs": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     "ename": "KeyError",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     "evalue": "'Hello'",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     "output_type": "error",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     "traceback": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "\u001b[0;31mKeyError\u001b[0m                                  Traceback (most recent call last)",
							 
						 
					
						
							
								
									
										
										
										
											2024-06-25 21:09:21 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								      "Cell \u001b[0;32mIn[17], line 5\u001b[0m\n\u001b[1;32m      1\u001b[0m tokenizer \u001b[38;5;241m=\u001b[39m SimpleTokenizerV1(vocab)\n\u001b[1;32m      3\u001b[0m text \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mHello, do you like tea. Is this-- a test?\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m----> 5\u001b[0m tokenizer\u001b[38;5;241m.\u001b[39mencode(text)\n",
							 
						 
					
						
							
								
									
										
										
										
											2024-07-04 13:27:27 -04:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								      "Cell \u001b[0;32mIn[13], line 12\u001b[0m, in \u001b[0;36mSimpleTokenizerV1.encode\u001b[0;34m(self, text)\u001b[0m\n\u001b[1;32m      7\u001b[0m preprocessed \u001b[38;5;241m=\u001b[39m re\u001b[38;5;241m.\u001b[39msplit(\u001b[38;5;124mr\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m([,.:;?_!\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m()\u001b[39m\u001b[38;5;130;01m\\'\u001b[39;00m\u001b[38;5;124m]|--|\u001b[39m\u001b[38;5;124m\\\u001b[39m\u001b[38;5;124ms)\u001b[39m\u001b[38;5;124m'\u001b[39m, text)\n\u001b[1;32m      9\u001b[0m preprocessed \u001b[38;5;241m=\u001b[39m [\n\u001b[1;32m     10\u001b[0m     item\u001b[38;5;241m.\u001b[39mstrip() \u001b[38;5;28;01mfor\u001b[39;00m item \u001b[38;5;129;01min\u001b[39;00m preprocessed \u001b[38;5;28;01mif\u001b[39;00m item\u001b[38;5;241m.\u001b[39mstrip()\n\u001b[1;32m     11\u001b[0m ]\n\u001b[0;32m---> 12\u001b[0m ids \u001b[38;5;241m=\u001b[39m [\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstr_to_int[s] \u001b[38;5;28;01mfor\u001b[39;00m s \u001b[38;5;129;01min\u001b[39;00m preprocessed]\n\u001b[1;32m     13\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m ids\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "Cell \u001b[0;32mIn[13], line 12\u001b[0m, in \u001b[0;36m<listcomp>\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m      7\u001b[0m preprocessed \u001b[38;5;241m=\u001b[39m re\u001b[38;5;241m.\u001b[39msplit(\u001b[38;5;124mr\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m([,.:;?_!\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m()\u001b[39m\u001b[38;5;130;01m\\'\u001b[39;00m\u001b[38;5;124m]|--|\u001b[39m\u001b[38;5;124m\\\u001b[39m\u001b[38;5;124ms)\u001b[39m\u001b[38;5;124m'\u001b[39m, text)\n\u001b[1;32m      9\u001b[0m preprocessed \u001b[38;5;241m=\u001b[39m [\n\u001b[1;32m     10\u001b[0m     item\u001b[38;5;241m.\u001b[39mstrip() \u001b[38;5;28;01mfor\u001b[39;00m item \u001b[38;5;129;01min\u001b[39;00m preprocessed \u001b[38;5;28;01mif\u001b[39;00m item\u001b[38;5;241m.\u001b[39mstrip()\n\u001b[1;32m     11\u001b[0m ]\n\u001b[0;32m---> 12\u001b[0m ids \u001b[38;5;241m=\u001b[39m [\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstr_to_int[s] \u001b[38;5;28;01mfor\u001b[39;00m s \u001b[38;5;129;01min\u001b[39;00m preprocessed]\n\u001b[1;32m     13\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m ids\n",
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								      "\u001b[0;31mKeyError\u001b[0m: 'Hello'"
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    }
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ],
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "tokenizer = SimpleTokenizerV1(vocab)\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "text = \"Hello, do you like tea. Is this-- a test?\"\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "tokenizer.encode(text)"
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "markdown",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "id": "dc53ee0c-fe2b-4cd8-a946-5471f7651acf",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "- The above produces an error because the word \"Hello\" is not contained in the vocabulary\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "- To deal with such cases, we can add special tokens like `\"<|unk|>\"` to the vocabulary to represent unknown words\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "- Since we are already extending the vocabulary, let's add another token called `\"<|endoftext|>\"` which is used in GPT-2 training to denote the end of a text (and it's also used between concatenated text, like if our training datasets consists of multiple articles, books, etc.)"
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "code",
							 
						 
					
						
							
								
									
										
										
										
											2024-05-22 20:27:09 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								   "execution_count": 18,
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								   "id": "ce9df29c-6c5b-43f1-8c1a-c7f7b79db78f",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "outputs": [],
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
									
										
										
										
											2024-01-10 08:01:19 -06:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "all_tokens = sorted(list(set(preprocessed)))\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "all_tokens.extend([\"<|endoftext|>\", \"<|unk|>\"])\n",
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								    "\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "vocab = {token:integer for integer,token in enumerate(all_tokens)}"
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "code",
							 
						 
					
						
							
								
									
										
										
										
											2024-05-22 20:27:09 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								   "execution_count": 19,
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								   "id": "57c3143b-e860-4d3b-a22a-de22b547a6a9",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "outputs": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     "data": {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "text/plain": [
							 
						 
					
						
							
								
									
										
										
										
											2024-06-25 21:44:19 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								       "1132"
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								      ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     },
							 
						 
					
						
							
								
									
										
										
										
											2024-05-22 20:27:09 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								     "execution_count": 19,
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								     "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     "output_type": "execute_result"
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    }
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ],
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "len(vocab.items())"
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "code",
							 
						 
					
						
							
								
									
										
										
										
											2024-05-22 20:27:09 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								   "execution_count": 20,
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								   "id": "50e51bb1-ae05-4aa8-a9ff-455b65ed1959",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "outputs": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     "name": "stdout",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     "output_type": "stream",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     "text": [
							 
						 
					
						
							
								
									
										
										
										
											2024-06-25 21:44:19 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								      "('younger', 1127)\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "('your', 1128)\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "('yourself', 1129)\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "('<|endoftext|>', 1130)\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "('<|unk|>', 1131)\n"
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								     ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    }
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ],
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "for i, item in enumerate(list(vocab.items())[-5:]):\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "    print(item)"
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "markdown",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "id": "a1daa2b0-6e75-412b-ab53-1f6fb7b4d453",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
									
										
										
										
											2024-02-03 03:41:31 +08:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "- We also need to adjust the tokenizer accordingly so that it knows when and how to use the new `<unk>` token"
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "code",
							 
						 
					
						
							
								
									
										
										
										
											2024-05-22 20:27:09 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								   "execution_count": 21,
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								   "id": "948861c5-3f30-4712-a234-725f20d26f68",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "outputs": [],
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "class SimpleTokenizerV2:\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "    def __init__(self, vocab):\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "        self.str_to_int = vocab\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "        self.int_to_str = { i:s for s,i in vocab.items()}\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "    \n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "    def encode(self, text):\n",
							 
						 
					
						
							
								
									
										
										
										
											2024-07-04 13:27:27 -04:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "        preprocessed = re.split(r'([,.:;?_!\"()\\']|--|\\s)', text)\n",
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								    "        preprocessed = [item.strip() for item in preprocessed if item.strip()]\n",
							 
						 
					
						
							
								
									
										
										
										
											2024-05-18 11:03:42 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "        preprocessed = [\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "            item if item in self.str_to_int \n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "            else \"<|unk|>\" for item in preprocessed\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "        ]\n",
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								    "\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "        ids = [self.str_to_int[s] for s in preprocessed]\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "        return ids\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "        \n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "    def decode(self, ids):\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "        text = \" \".join([self.int_to_str[i] for i in ids])\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "        # Replace spaces before the specified punctuations\n",
							 
						 
					
						
							
								
									
										
										
										
											2024-07-05 08:34:27 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "        text = re.sub(r'\\s+([,.:;?!\"()\\'])', r'\\1', text)\n",
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								    "        return text"
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "markdown",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "id": "aa728dd1-9d35-4ac7-938f-d411d73083f6",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "Let's try to tokenize text with the modified tokenizer:"
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "code",
							 
						 
					
						
							
								
									
										
										
										
											2024-05-22 20:27:09 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								   "execution_count": 22,
							 
						 
					
						
							
								
									
										
										
										
											2024-07-05 08:34:27 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								   "id": "4133c502-18ac-4412-9f43-01caf4efa3dc",
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
									
										
										
										
											2023-10-15 17:15:20 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								   "outputs": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     "name": "stdout",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     "output_type": "stream",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     "text": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "Hello, do you like tea? <|endoftext|> In the sunlit terraces of the palace.\n"
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    }
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ],
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "tokenizer = SimpleTokenizerV2(vocab)\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "text1 = \"Hello, do you like tea?\"\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "text2 = \"In the sunlit terraces of the palace.\"\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "text = \" <|endoftext|> \".join((text1, text2))\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "print(text)"
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "code",
							 
						 
					
						
							
								
									
										
										
										
											2024-05-22 20:27:09 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								   "execution_count": 23,
							 
						 
					
						
							
								
									
										
										
										
											2024-07-05 08:34:27 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								   "id": "7ed395fe-dc1b-4ed2-b85b-457cc35aab60",
							 
						 
					
						
							
								
									
										
										
										
											2023-10-15 17:15:20 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								   "outputs": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    {
							 
						 
					
						
							
								
									
										
										
										
											2024-03-04 06:42:58 -06:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								     "data": {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "text/plain": [
							 
						 
					
						
							
								
									
										
										
										
											2024-06-25 21:44:19 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								       "[1131, 5, 355, 1126, 628, 975, 10, 1130, 55, 988, 956, 984, 722, 988, 1131, 7]"
							 
						 
					
						
							
								
									
										
										
										
											2024-03-04 06:42:58 -06:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								      ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     },
							 
						 
					
						
							
								
									
										
										
										
											2024-05-22 20:27:09 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								     "execution_count": 23,
							 
						 
					
						
							
								
									
										
										
										
											2024-03-04 06:42:58 -06:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								     "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     "output_type": "execute_result"
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								    }
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ],
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "tokenizer.encode(text)"
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "code",
							 
						 
					
						
							
								
									
										
										
										
											2024-05-22 20:27:09 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								   "execution_count": 24,
							 
						 
					
						
							
								
									
										
										
										
											2024-07-05 08:34:27 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								   "id": "059367f9-7a60-4c0d-8a00-7c4c766d0ebc",
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
									
										
										
										
											2024-03-04 06:42:58 -06:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								   "outputs": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     "data": {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "text/plain": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								       "'<|unk|>, do you like tea? <|endoftext|> In the sunlit terraces of the <|unk|>.'"
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     },
							 
						 
					
						
							
								
									
										
										
										
											2024-05-22 20:27:09 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								     "execution_count": 24,
							 
						 
					
						
							
								
									
										
										
										
											2024-03-04 06:42:58 -06:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								     "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     "output_type": "execute_result"
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    }
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ],
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "tokenizer.decode(tokenizer.encode(text))"
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "markdown",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "id": "5c4ba34b-170f-4e71-939b-77aabb776f14",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
									
										
										
										
											2023-10-02 06:48:16 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "## 2.5 BytePair encoding"
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "markdown",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "id": "2309494c-79cf-4a2d-bc28-a94d602f050e",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "- GPT-2 used BytePair encoding (BPE) as its tokenizer\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "- it allows the model to break down words that aren't in its predefined vocabulary into smaller subword units or even individual characters, enabling it to handle out-of-vocabulary words\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "- For instance, if GPT-2's vocabulary doesn't have the word \"unfamiliarword,\" it might tokenize it as [\"unfam\", \"iliar\", \"word\"] or some other subword breakdown, depending on its trained BPE merges\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "- The original BPE tokenizer can be found here: [https://github.com/openai/gpt-2/blob/master/src/encoder.py](https://github.com/openai/gpt-2/blob/master/src/encoder.py)\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "- In this chapter, we are using the BPE tokenizer from OpenAI's open-source [tiktoken](https://github.com/openai/tiktoken) library, which implements its core algorithms in Rust to improve computational performance\n",
							 
						 
					
						
							
								
									
										
										
										
											2024-03-11 10:52:56 +08:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "- I created a notebook in the [./bytepair_encoder](../02_bonus_bytepair-encoder) that compares these two implementations side-by-side (tiktoken was about 5x faster on the sample text)"
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "code",
							 
						 
					
						
							
								
									
										
										
										
											2024-05-22 20:27:09 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								   "execution_count": 25,
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								   "id": "ede1d41f-934b-4bf4-8184-54394a257a94",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "outputs": [],
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "# pip install tiktoken"
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "code",
							 
						 
					
						
							
								
									
										
										
										
											2024-05-22 20:27:09 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								   "execution_count": 26,
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								   "id": "48967a77-7d17-42bf-9e92-fc619d63a59e",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "outputs": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     "name": "stdout",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     "output_type": "stream",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     "text": [
							 
						 
					
						
							
								
									
										
										
										
											2024-06-25 21:09:21 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								      "tiktoken version: 0.7.0\n"
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								     ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    }
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ],
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "import importlib\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "import tiktoken\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "print(\"tiktoken version:\", importlib.metadata.version(\"tiktoken\"))"
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "code",
							 
						 
					
						
							
								
									
										
										
										
											2024-05-22 20:27:09 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								   "execution_count": 27,
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								   "id": "6ad3312f-a5f7-4efc-9d7d-8ea09d7b5128",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "outputs": [],
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "tokenizer = tiktoken.get_encoding(\"gpt2\")"
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "code",
							 
						 
					
						
							
								
									
										
										
										
											2024-05-22 20:27:09 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								   "execution_count": 28,
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								   "id": "5ff2cd85-7cfb-4325-b390-219938589428",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "outputs": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     "name": "stdout",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     "output_type": "stream",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     "text": [
							 
						 
					
						
							
								
									
										
										
										
											2024-05-18 11:03:42 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								      "[15496, 11, 466, 345, 588, 8887, 30, 220, 50256, 554, 262, 4252, 18250, 8812, 2114, 1659, 617, 34680, 27271, 13]\n"
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								     ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    }
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ],
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
									
										
										
										
											2024-05-18 11:03:42 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "text = (\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "    \"Hello, do you like tea? <|endoftext|> In the sunlit terraces\"\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "     \"of someunknownPlace.\"\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    ")\n",
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								    "\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "integers = tokenizer.encode(text, allowed_special={\"<|endoftext|>\"})\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "print(integers)"
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "code",
							 
						 
					
						
							
								
									
										
										
										
											2024-05-22 20:27:09 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								   "execution_count": 29,
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								   "id": "d26a48bb-f82e-41a8-a955-a1c9cf9d50ab",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "outputs": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     "name": "stdout",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     "output_type": "stream",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     "text": [
							 
						 
					
						
							
								
									
										
										
										
											2024-05-18 11:03:42 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								      "Hello, do you like tea? <|endoftext|> In the sunlit terracesof someunknownPlace.\n"
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								     ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    }
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ],
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "strings = tokenizer.decode(integers)\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "print(strings)"
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
									
										
										
										
											2024-03-17 09:08:38 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "markdown",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "id": "e8c2e7b4-6a22-42aa-8e4d-901f06378d4a",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "- BPE tokenizers break down unknown words into subwords and individual characters:"
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
									
										
										
										
											2024-03-08 07:18:06 -06:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "markdown",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "id": "c082d41f-33d7-4827-97d8-993d5a84bb3c",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
									
										
										
										
											2024-03-17 09:08:38 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "<img src=\"https://sebastianraschka.com/images/LLMs-from-scratch-images/ch02_compressed/11.webp\" width=\"300px\">"
							 
						 
					
						
							
								
									
										
										
										
											2024-03-08 07:18:06 -06:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "markdown",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "id": "abbd7c0d-70f8-4386-a114-907e96c950b0",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
									
										
										
										
											2023-10-02 06:48:16 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "## 2.6 Data sampling with a sliding window"
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
									
										
										
										
											2024-03-17 09:08:38 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "markdown",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "id": "509d9826-6384-462e-aa8a-a7c73cd6aad0",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "- We train LLMs to generate one word at a time, so we want to prepare the training data accordingly where the next word in a sequence represents the target to predict:"
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
									
										
										
										
											2024-03-08 07:18:06 -06:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "markdown",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "id": "39fb44f4-0c43-4a6a-9c2f-9cf31452354c",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
									
										
										
										
											2024-03-17 09:08:38 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "<img src=\"https://sebastianraschka.com/images/LLMs-from-scratch-images/ch02_compressed/12.webp\" width=\"400px\">"
							 
						 
					
						
							
								
									
										
										
										
											2024-03-08 07:18:06 -06:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "code",
							 
						 
					
						
							
								
									
										
										
										
											2024-05-22 20:27:09 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								   "execution_count": 30,
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								   "id": "848d5ade-fd1f-46c3-9e31-1426e315c71b",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "outputs": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     "name": "stdout",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     "output_type": "stream",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     "text": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "5145\n"
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    }
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ],
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
									
										
										
										
											2023-10-15 17:15:20 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "with open(\"the-verdict.txt\", \"r\", encoding=\"utf-8\") as f:\n",
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								    "    raw_text = f.read()\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "\n",
							 
						 
					
						
							
								
									
										
										
										
											2023-10-15 17:15:20 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "enc_text = tokenizer.encode(raw_text)\n",
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								    "print(len(enc_text))"
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "markdown",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "id": "cebd0657-5543-43ca-8011-2ae6bd0a5810",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "- For each text chunk, we want the inputs and targets\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "- Since we want the model to predict the next word, the targets are the inputs shifted by one position to the right"
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "code",
							 
						 
					
						
							
								
									
										
										
										
											2024-05-22 20:27:09 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								   "execution_count": 31,
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								   "id": "e84424a7-646d-45b6-99e3-80d15fb761f2",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "outputs": [],
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "enc_sample = enc_text[50:]"
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "code",
							 
						 
					
						
							
								
									
										
										
										
											2024-05-22 20:27:09 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								   "execution_count": 32,
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								   "id": "dfbff852-a92f-48c8-a46d-143a0f109f40",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "outputs": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     "name": "stdout",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     "output_type": "stream",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     "text": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "x: [290, 4920, 2241, 287]\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "y:      [4920, 2241, 287, 257]\n"
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    }
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ],
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "context_size = 4\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "x = enc_sample[:context_size]\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "y = enc_sample[1:context_size+1]\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "print(f\"x: {x}\")\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "print(f\"y:      {y}\")"
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "markdown",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "id": "815014ef-62f7-4476-a6ad-66e20e42b7c3",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "- One by one, the prediction would look like as follows:"
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "code",
							 
						 
					
						
							
								
									
										
										
										
											2024-05-22 20:27:09 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								   "execution_count": 33,
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								   "id": "d97b031e-ed55-409d-95f2-aeb38c6fe366",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "outputs": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     "name": "stdout",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     "output_type": "stream",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     "text": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "[290] ----> 4920\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "[290, 4920] ----> 2241\n",
							 
						 
					
						
							
								
									
										
										
										
											2023-10-15 17:15:20 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								      "[290, 4920, 2241] ----> 287\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "[290, 4920, 2241, 287] ----> 257\n"
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								     ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    }
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ],
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
									
										
										
										
											2023-10-15 17:15:20 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "for i in range(1, context_size+1):\n",
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								    "    context = enc_sample[:i]\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "    desired = enc_sample[i]\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "    print(context, \"---->\", desired)"
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "code",
							 
						 
					
						
							
								
									
										
										
										
											2024-05-22 20:27:09 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								   "execution_count": 34,
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								   "id": "f57bd746-dcbf-4433-8e24-ee213a8c34a1",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "outputs": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     "name": "stdout",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     "output_type": "stream",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     "text": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      " and ---->  established\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      " and established ---->  himself\n",
							 
						 
					
						
							
								
									
										
										
										
											2023-10-15 17:15:20 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								      " and established himself ---->  in\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      " and established himself in ---->  a\n"
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								     ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    }
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ],
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
									
										
										
										
											2023-10-15 17:15:20 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "for i in range(1, context_size+1):\n",
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								    "    context = enc_sample[:i]\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "    desired = enc_sample[i]\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "    print(tokenizer.decode(context), \"---->\", tokenizer.decode([desired]))"
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "markdown",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "id": "210d2dd9-fc20-4927-8d3d-1466cf41aae1",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "- We will take care of the next-word prediction in a later chapter after we covered the attention mechanism\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "- For now, we implement a simple data loader that iterates over the input dataset and returns the inputs and targets shifted by one"
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "markdown",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "id": "a1a1b47a-f646-49d1-bc70-fddf2c840796",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "- Install and import PyTorch (see Appendix A for installation tips)"
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "code",
							 
						 
					
						
							
								
									
										
										
										
											2024-05-22 20:27:09 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								   "execution_count": 35,
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								   "id": "e1770134-e7f3-4725-a679-e04c3be48cac",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "outputs": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     "name": "stdout",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     "output_type": "stream",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     "text": [
							 
						 
					
						
							
								
									
										
										
										
											2025-02-04 20:16:07 -06:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								      "PyTorch version: 2.5.1\n"
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								     ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    }
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ],
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "import torch\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "print(\"PyTorch version:\", torch.__version__)"
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
									
										
										
										
											2024-03-17 09:08:38 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "markdown",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "id": "0c9a3d50-885b-49bc-b791-9f5cc8bc7b7c",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
									
										
										
										
											2024-04-22 20:40:48 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "- We use a sliding window approach, changing the position by +1:\n",
							 
						 
					
						
							
								
									
										
										
										
											2024-03-17 09:08:38 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "\n",
							 
						 
					
						
							
								
									
										
										
										
											2024-06-01 09:38:33 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "<img src=\"https://sebastianraschka.com/images/LLMs-from-scratch-images/ch02_compressed/13.webp?123\" width=\"500px\">"
							 
						 
					
						
							
								
									
										
										
										
											2024-03-17 09:08:38 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "markdown",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "id": "92ac652d-7b38-4843-9fbd-494cdc8ec12c",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "- Create dataset and dataloader that extract chunks from the input text dataset"
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "code",
							 
						 
					
						
							
								
									
										
										
										
											2024-05-22 20:27:09 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								   "execution_count": 36,
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								   "id": "74b41073-4c9f-46e2-a1bd-d38e4122b375",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "outputs": [],
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "from torch.utils.data import Dataset, DataLoader\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "class GPTDatasetV1(Dataset):\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "    def __init__(self, txt, tokenizer, max_length, stride):\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "        self.input_ids = []\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "        self.target_ids = []\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "        # Tokenize the entire text\n",
							 
						 
					
						
							
								
									
										
										
										
											2024-04-13 14:57:56 -04:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "        token_ids = tokenizer.encode(txt, allowed_special={\"<|endoftext|>\"})\n",
							 
						 
					
						
							
								
									
										
										
										
											2025-03-30 16:01:37 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "        assert len(token_ids) > max_length, \"Number of tokenized inputs must at least be equal to max_length+1\"\n",
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								    "\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "        # Use a sliding window to chunk the book into overlapping sequences of max_length\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "        for i in range(0, len(token_ids) - max_length, stride):\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "            input_chunk = token_ids[i:i + max_length]\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "            target_chunk = token_ids[i + 1: i + max_length + 1]\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "            self.input_ids.append(torch.tensor(input_chunk))\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "            self.target_ids.append(torch.tensor(target_chunk))\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "    def __len__(self):\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "        return len(self.input_ids)\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "    def __getitem__(self, idx):\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "        return self.input_ids[idx], self.target_ids[idx]"
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "code",
							 
						 
					
						
							
								
									
										
										
										
											2024-05-22 20:27:09 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								   "execution_count": 37,
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								   "id": "5eb30ebe-97b3-43c5-9ff1-a97d621b3c4e",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "outputs": [],
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
									
										
										
										
											2024-05-18 11:03:42 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "def create_dataloader_v1(txt, batch_size=4, max_length=256, \n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "                         stride=128, shuffle=True, drop_last=True,\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "                         num_workers=0):\n",
							 
						 
					
						
							
								
									
										
										
										
											2024-02-25 07:23:38 -06:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "\n",
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								    "    # Initialize the tokenizer\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "    tokenizer = tiktoken.get_encoding(\"gpt2\")\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "    # Create dataset\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "    dataset = GPTDatasetV1(txt, tokenizer, max_length, stride)\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "    # Create dataloader\n",
							 
						 
					
						
							
								
									
										
										
										
											2024-02-25 07:23:38 -06:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "    dataloader = DataLoader(\n",
							 
						 
					
						
							
								
									
										
										
										
											2024-04-13 14:57:56 -04:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "        dataset,\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "        batch_size=batch_size,\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "        shuffle=shuffle,\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "        drop_last=drop_last,\n",
							 
						 
					
						
							
								
									
										
										
										
											2024-06-20 00:36:46 +02:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "        num_workers=num_workers\n",
							 
						 
					
						
							
								
									
										
										
										
											2024-04-13 14:57:56 -04:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "    )\n",
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								    "\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "    return dataloader"
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "markdown",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "id": "42dd68ef-59f7-45ff-ba44-e311c899ddcd",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "- Let's test the dataloader with a batch size of 1 for an LLM with a context size of 4:"
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "code",
							 
						 
					
						
							
								
									
										
										
										
											2024-05-22 20:27:09 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								   "execution_count": 38,
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								   "id": "df31d96c-6bfd-4564-a956-6192242d7579",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "outputs": [],
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
									
										
										
										
											2023-10-15 17:15:20 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "with open(\"the-verdict.txt\", \"r\", encoding=\"utf-8\") as f:\n",
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								    "    raw_text = f.read()"
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "code",
							 
						 
					
						
							
								
									
										
										
										
											2024-05-22 20:27:09 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								   "execution_count": 39,
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								   "id": "9226d00c-ad9a-4949-a6e4-9afccfc7214f",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "outputs": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     "name": "stdout",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     "output_type": "stream",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     "text": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "[tensor([[  40,  367, 2885, 1464]]), tensor([[ 367, 2885, 1464, 1807]])]\n"
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    }
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ],
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
									
										
										
										
											2024-05-18 11:03:42 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "dataloader = create_dataloader_v1(\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "    raw_text, batch_size=1, max_length=4, stride=1, shuffle=False\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    ")\n",
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								    "\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "data_iter = iter(dataloader)\n",
							 
						 
					
						
							
								
									
										
										
										
											2023-10-15 17:15:20 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "first_batch = next(data_iter)\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "print(first_batch)"
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "code",
							 
						 
					
						
							
								
									
										
										
										
											2024-05-22 20:27:09 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								   "execution_count": 40,
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								   "id": "10deb4bc-4de1-4d20-921e-4b1c7a0e1a6d",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "outputs": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     "name": "stdout",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     "output_type": "stream",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     "text": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "[tensor([[ 367, 2885, 1464, 1807]]), tensor([[2885, 1464, 1807, 3619]])]\n"
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    }
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ],
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
									
										
										
										
											2023-10-15 17:15:20 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "second_batch = next(data_iter)\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "print(second_batch)"
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
									
										
										
										
											2024-03-17 09:08:38 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "markdown",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "id": "b006212f-de45-468d-bdee-5806216d1679",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "- An example using stride equal to the context length (here: 4) as shown below:"
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
									
										
										
										
											2024-03-08 07:18:06 -06:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "markdown",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "id": "9cb467e0-bdcd-4dda-b9b0-a738c5d33ac3",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
									
										
										
										
											2024-03-17 09:08:38 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "<img src=\"https://sebastianraschka.com/images/LLMs-from-scratch-images/ch02_compressed/14.webp\" width=\"500px\">"
							 
						 
					
						
							
								
									
										
										
										
											2024-03-08 07:18:06 -06:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "markdown",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "id": "b1ae6d45-f26e-4b83-9c7b-cff55ffa7d16",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "- We can also create batched outputs\n",
							 
						 
					
						
							
								
									
										
										
										
											2023-10-15 17:15:20 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "- Note that we increase the stride here so that we don't have overlaps between the batches, since more overlap could lead to increased overfitting"
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "code",
							 
						 
					
						
							
								
									
										
										
										
											2024-05-22 20:27:09 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								   "execution_count": 41,
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								   "id": "1916e7a6-f03d-4f09-91a6-d0bdbac5a58c",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "outputs": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     "name": "stdout",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     "output_type": "stream",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     "text": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "Inputs:\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      " tensor([[   40,   367,  2885,  1464],\n",
							 
						 
					
						
							
								
									
										
										
										
											2024-02-03 08:50:56 -06:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								      "        [ 1807,  3619,   402,   271],\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "        [10899,  2138,   257,  7026],\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "        [15632,   438,  2016,   257],\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "        [  922,  5891,  1576,   438],\n",
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								      "        [  568,   340,   373,   645],\n",
							 
						 
					
						
							
								
									
										
										
										
											2024-02-03 08:50:56 -06:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								      "        [ 1049,  5975,   284,   502],\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "        [  284,  3285,   326,    11]])\n",
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								      "\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "Targets:\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      " tensor([[  367,  2885,  1464,  1807],\n",
							 
						 
					
						
							
								
									
										
										
										
											2024-02-03 08:50:56 -06:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								      "        [ 3619,   402,   271, 10899],\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "        [ 2138,   257,  7026, 15632],\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "        [  438,  2016,   257,   922],\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "        [ 5891,  1576,   438,   568],\n",
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								      "        [  340,   373,   645,  1049],\n",
							 
						 
					
						
							
								
									
										
										
										
											2024-02-03 08:50:56 -06:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								      "        [ 5975,   284,   502,   284],\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "        [ 3285,   326,    11,   287]])\n"
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								     ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    }
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ],
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
									
										
										
										
											2024-02-03 08:50:56 -06:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "dataloader = create_dataloader_v1(raw_text, batch_size=8, max_length=4, stride=4, shuffle=False)\n",
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								    "\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "data_iter = iter(dataloader)\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "inputs, targets = next(data_iter)\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "print(\"Inputs:\\n\", inputs)\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "print(\"\\nTargets:\\n\", targets)"
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "markdown",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "id": "2cd2fcda-2fda-4aa8-8bc8-de1e496f9db1",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
									
										
										
										
											2023-10-02 06:48:16 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "## 2.7 Creating token embeddings"
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "markdown",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "id": "1a301068-6ab2-44ff-a915-1ba11688274f",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "- The data is already almost ready for an LLM\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "- But lastly let us embed the tokens in a continuous vector representation using an embedding layer\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "- Usually, these embedding layers are part of the LLM itself and are updated (trained) during model training"
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
									
										
										
										
											2024-03-08 07:18:06 -06:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "markdown",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "id": "e85089aa-8671-4e5f-a2b3-ef252004ee4c",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
									
										
										
										
											2024-03-17 09:08:38 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "<img src=\"https://sebastianraschka.com/images/LLMs-from-scratch-images/ch02_compressed/15.webp\" width=\"400px\">"
							 
						 
					
						
							
								
									
										
										
										
											2024-03-08 07:18:06 -06:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "markdown",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "id": "44e014ca-1fc5-4b90-b6fa-c2097bb92c0b",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
									
										
										
										
											2024-07-04 13:27:27 -04:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "- Suppose we have the following four input examples with input ids 2, 3, 5, and 1 (after tokenization):"
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "code",
							 
						 
					
						
							
								
									
										
										
										
											2024-05-22 20:27:09 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								   "execution_count": 42,
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								   "id": "15a6304c-9474-4470-b85d-3991a49fa653",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "outputs": [],
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
									
										
										
										
											2024-03-08 07:18:06 -06:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "input_ids = torch.tensor([2, 3, 5, 1])"
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "markdown",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "id": "14da6344-2c71-4837-858d-dd120005ba05",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "- For the sake of simplicity, suppose we have a small vocabulary of only 6 words and we want to create embeddings of size 3:"
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "code",
							 
						 
					
						
							
								
									
										
										
										
											2024-05-22 20:27:09 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								   "execution_count": 43,
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								   "id": "93cb2cee-9aa6-4bb8-8977-c65661d16eda",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "outputs": [],
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "vocab_size = 6\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "output_dim = 3\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "torch.manual_seed(123)\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "embedding_layer = torch.nn.Embedding(vocab_size, output_dim)"
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "markdown",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "id": "4ff241f6-78eb-4e4a-a55f-5b2b6196d5b0",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "- This would result in a 6x3 weight matrix:"
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "code",
							 
						 
					
						
							
								
									
										
										
										
											2024-05-22 20:27:09 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								   "execution_count": 44,
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								   "id": "a686eb61-e737-4351-8f1c-222913d47468",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "outputs": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    {
							 
						 
					
						
							
								
									
										
										
										
											2023-10-15 17:15:20 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								     "name": "stdout",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     "output_type": "stream",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     "text": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "Parameter containing:\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "tensor([[ 0.3374, -0.1778, -0.1690],\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "        [ 0.9178,  1.5810,  1.3010],\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "        [ 1.2753, -0.2010, -0.1606],\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "        [-0.4015,  0.9666, -1.1481],\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "        [-1.1589,  0.3255, -0.6315],\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "        [-2.8400, -0.7849, -1.4096]], requires_grad=True)\n"
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     ]
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								    }
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ],
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
									
										
										
										
											2023-10-15 17:15:20 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "print(embedding_layer.weight)"
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "markdown",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "id": "26fcf4f5-0801-4eb4-bb90-acce87935ac7",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
									
										
										
										
											2024-03-11 10:52:56 +08:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "- For those who are familiar with one-hot encoding, the embedding layer approach above is essentially just a more efficient way of implementing one-hot encoding followed by matrix multiplication in a fully-connected layer, which is described in the supplementary code in [./embedding_vs_matmul](../03_bonus_embedding-vs-matmul)\n",
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								    "- Because the embedding layer is just a more efficient implementation that is equivalent to the one-hot encoding and matrix-multiplication approach it can be seen as a neural network layer that can be optimized via backpropagation"
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "markdown",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "id": "4b0d58c3-83c0-4205-aca2-9c48b19fd4a7",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "- To convert a token with id 3 into a 3-dimensional vector, we do the following:"
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "code",
							 
						 
					
						
							
								
									
										
										
										
											2024-05-22 20:27:09 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								   "execution_count": 45,
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								   "id": "e43600ba-f287-4746-8ddf-d0f71a9023ca",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "outputs": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    {
							 
						 
					
						
							
								
									
										
										
										
											2023-10-15 17:15:20 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								     "name": "stdout",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     "output_type": "stream",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     "text": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "tensor([[-0.4015,  0.9666, -1.1481]], grad_fn=<EmbeddingBackward0>)\n"
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     ]
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								    }
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ],
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
									
										
										
										
											2023-10-15 17:15:20 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "print(embedding_layer(torch.tensor([3])))"
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "markdown",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "id": "a7bbf625-4f36-491d-87b4-3969efb784b0",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
									
										
										
										
											2024-02-09 04:18:07 +08:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "- Note that the above is the 4th row in the `embedding_layer` weight matrix\n",
							 
						 
					
						
							
								
									
										
										
										
											2024-03-17 23:40:44 +08:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "- To embed all four `input_ids` values above, we do"
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "code",
							 
						 
					
						
							
								
									
										
										
										
											2024-05-22 20:27:09 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								   "execution_count": 46,
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								   "id": "50280ead-0363-44c8-8c35-bb885d92c8b7",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "outputs": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    {
							 
						 
					
						
							
								
									
										
										
										
											2023-10-15 17:15:20 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								     "name": "stdout",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     "output_type": "stream",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     "text": [
							 
						 
					
						
							
								
									
										
										
										
											2024-03-08 07:18:06 -06:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								      "tensor([[ 1.2753, -0.2010, -0.1606],\n",
							 
						 
					
						
							
								
									
										
										
										
											2023-10-15 17:15:20 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								      "        [-0.4015,  0.9666, -1.1481],\n",
							 
						 
					
						
							
								
									
										
										
										
											2024-03-08 07:18:06 -06:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								      "        [-2.8400, -0.7849, -1.4096],\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "        [ 0.9178,  1.5810,  1.3010]], grad_fn=<EmbeddingBackward0>)\n"
							 
						 
					
						
							
								
									
										
										
										
											2023-10-15 17:15:20 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								     ]
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								    }
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ],
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
									
										
										
										
											2023-10-15 17:15:20 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "print(embedding_layer(input_ids))"
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
									
										
										
										
											2024-03-17 09:08:38 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
									
										
										
										
											2024-04-03 03:37:45 +08:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								   "cell_type": "markdown",
							 
						 
					
						
							
								
									
										
										
										
											2024-03-17 09:08:38 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								   "id": "be97ced4-bd13-42b7-866a-4d699a17e155",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "- An embedding layer is essentially a look-up operation:"
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
									
										
										
										
											2024-03-08 07:18:06 -06:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "markdown",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "id": "f33c2741-bf1b-4c60-b7fd-61409d556646",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
									
										
										
										
											2024-07-16 17:51:15 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "<img src=\"https://sebastianraschka.com/images/LLMs-from-scratch-images/ch02_compressed/16.webp?123\" width=\"500px\">"
							 
						 
					
						
							
								
									
										
										
										
											2024-03-17 09:08:38 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "markdown",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "id": "08218d9f-aa1a-4afb-a105-72ff96a54e73",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
									
										
										
										
											2024-04-12 19:08:34 -04:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "- **You may be interested in the bonus content comparing embedding layers with regular linear layers: [../03_bonus_embedding-vs-matmul](../03_bonus_embedding-vs-matmul)**"
							 
						 
					
						
							
								
									
										
										
										
											2024-03-08 07:18:06 -06:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "markdown",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "id": "c393d270-b950-4bc8-99ea-97d74f2ea0f6",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
									
										
										
										
											2023-10-02 06:48:16 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "## 2.8 Encoding word positions"
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
									
										
										
										
											2024-03-17 09:08:38 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "markdown",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "id": "24940068-1099-4698-bdc0-e798515e2902",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "- Embedding layer convert IDs into identical vector representations regardless of where they are located in the input sequence:"
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
									
										
										
										
											2024-03-08 07:18:06 -06:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "markdown",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "id": "9e0b14a2-f3f3-490e-b513-f262dbcf94fa",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
									
										
										
										
											2024-03-17 09:08:38 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "<img src=\"https://sebastianraschka.com/images/LLMs-from-scratch-images/ch02_compressed/17.webp\" width=\"400px\">"
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "markdown",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "id": "92a7d7fe-38a5-46e6-8db6-b688887b0430",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "- Positional embeddings are combined with the token embedding vector to form the input embeddings for a large language model:"
							 
						 
					
						
							
								
									
										
										
										
											2024-03-08 07:18:06 -06:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "markdown",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "id": "48de37db-d54d-45c4-ab3e-88c0783ad2e4",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
									
										
										
										
											2024-03-17 09:08:38 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "<img src=\"https://sebastianraschka.com/images/LLMs-from-scratch-images/ch02_compressed/18.webp\" width=\"500px\">"
							 
						 
					
						
							
								
									
										
										
										
											2024-03-08 07:18:06 -06:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "markdown",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "id": "7f187f87-c1f8-4c2e-8050-350bbb972f55",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
									
										
										
										
											2023-10-15 17:15:20 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "- The BytePair encoder has a vocabulary size of 50,257:\n",
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								    "- Suppose we want to encode the input tokens into a 256-dimensional vector representation:"
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "code",
							 
						 
					
						
							
								
									
										
										
										
											2024-05-22 20:27:09 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								   "execution_count": 47,
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								   "id": "0b9e344d-03a6-4f2c-b723-67b6a20c5041",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "outputs": [],
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "vocab_size = 50257\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "output_dim = 256\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)"
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "markdown",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "id": "a2654722-24e4-4b0d-a43c-436a461eb70b",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
									
										
										
										
											2023-10-15 17:15:20 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "- If we sample data from the dataloader, we embed the tokens in each batch into a 256-dimensional vector\n",
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								    "- If we have a batch size of 8 with 4 tokens each, this results in a 8 x 4 x 256 tensor:"
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "code",
							 
						 
					
						
							
								
									
										
										
										
											2024-05-22 20:27:09 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								   "execution_count": 48,
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								   "id": "ad56a263-3d2e-4d91-98bf-d0b68d3c7fc3",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "outputs": [],
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
									
										
										
										
											2023-10-15 17:15:20 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "max_length = 4\n",
							 
						 
					
						
							
								
									
										
										
										
											2024-05-18 11:03:42 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "dataloader = create_dataloader_v1(\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "    raw_text, batch_size=8, max_length=max_length,\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "    stride=max_length, shuffle=False\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    ")\n",
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								    "data_iter = iter(dataloader)\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "inputs, targets = next(data_iter)"
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "code",
							 
						 
					
						
							
								
									
										
										
										
											2024-05-22 20:27:09 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								   "execution_count": 49,
							 
						 
					
						
							
								
									
										
										
										
											2023-10-15 17:15:20 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								   "id": "84416b60-3707-4370-bcbc-da0b62f2b64d",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "outputs": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     "name": "stdout",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     "output_type": "stream",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     "text": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "Token IDs:\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      " tensor([[   40,   367,  2885,  1464],\n",
							 
						 
					
						
							
								
									
										
										
										
											2024-03-03 19:37:06 -06:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								      "        [ 1807,  3619,   402,   271],\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "        [10899,  2138,   257,  7026],\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "        [15632,   438,  2016,   257],\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "        [  922,  5891,  1576,   438],\n",
							 
						 
					
						
							
								
									
										
										
										
											2023-10-15 17:15:20 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								      "        [  568,   340,   373,   645],\n",
							 
						 
					
						
							
								
									
										
										
										
											2024-03-03 19:37:06 -06:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								      "        [ 1049,  5975,   284,   502],\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "        [  284,  3285,   326,    11]])\n",
							 
						 
					
						
							
								
									
										
										
										
											2023-10-15 17:15:20 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								      "\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "Inputs shape:\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      " torch.Size([8, 4])\n"
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    }
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ],
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "print(\"Token IDs:\\n\", inputs)\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "print(\"\\nInputs shape:\\n\", inputs.shape)"
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "code",
							 
						 
					
						
							
								
									
										
										
										
											2024-05-22 20:27:09 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								   "execution_count": 50,
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								   "id": "7766ec38-30d0-4128-8c31-f49f063c43d1",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "outputs": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    {
							 
						 
					
						
							
								
									
										
										
										
											2023-10-15 17:15:20 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								     "name": "stdout",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     "output_type": "stream",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     "text": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "torch.Size([8, 4, 256])\n"
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     ]
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								    }
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ],
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "token_embeddings = token_embedding_layer(inputs)\n",
							 
						 
					
						
							
								
									
										
										
										
											2025-01-13 12:44:06 -08:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "print(token_embeddings.shape)\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "# uncomment & execute the following line to see how the embeddings look like\n",
							 
						 
					
						
							
								
									
										
										
										
											2025-02-04 20:16:07 -06:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "# print(token_embeddings)"
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "markdown",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "id": "fe2ae164-6f19-4e32-b9e5-76950fcf1c9f",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
									
										
										
										
											2023-10-15 17:15:20 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "- GPT-2 uses absolute position embeddings, so we just create another embedding layer:"
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "code",
							 
						 
					
						
							
								
									
										
										
										
											2024-05-22 20:27:09 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								   "execution_count": 51,
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								   "id": "cc048e20-7ac8-417e-81f5-8fe6f9a4fe07",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "outputs": [],
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
									
										
										
										
											2024-04-04 07:27:41 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "context_length = max_length\n",
							 
						 
					
						
							
								
									
										
										
										
											2025-01-13 12:44:06 -08:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "# uncomment & execute the following line to see how the embedding layer weights look like\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "# print(pos_embedding_layer.weight)"
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "code",
							 
						 
					
						
							
								
									
										
										
										
											2024-05-22 20:27:09 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								   "execution_count": 52,
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								   "id": "c369a1e7-d566-4b53-b398-d6adafb44105",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "outputs": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    {
							 
						 
					
						
							
								
									
										
										
										
											2023-10-15 17:15:20 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								     "name": "stdout",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     "output_type": "stream",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     "text": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "torch.Size([4, 256])\n"
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     ]
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								    }
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ],
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
									
										
										
										
											2023-10-15 17:15:20 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "pos_embeddings = pos_embedding_layer(torch.arange(max_length))\n",
							 
						 
					
						
							
								
									
										
										
										
											2025-01-13 12:44:06 -08:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "print(pos_embeddings.shape)\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "# uncomment & execute the following line to see how the embeddings look like\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "# print(pos_embeddings)"
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "markdown",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "id": "870e9d9f-2935-461a-9518-6d1386b976d6",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "- To create the input embeddings used in an LLM, we simply add the token and the positional embeddings:"
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "code",
							 
						 
					
						
							
								
									
										
										
										
											2024-05-22 20:27:09 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								   "execution_count": 53,
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								   "id": "b22fab89-526e-43c8-9035-5b7018e34288",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "outputs": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    {
							 
						 
					
						
							
								
									
										
										
										
											2023-10-15 17:15:20 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								     "name": "stdout",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     "output_type": "stream",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     "text": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								      "torch.Size([8, 4, 256])\n"
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								     ]
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								    }
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ],
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "input_embeddings = token_embeddings + pos_embeddings\n",
							 
						 
					
						
							
								
									
										
										
										
											2025-01-13 12:44:06 -08:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "print(input_embeddings.shape)\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "# uncomment & execute the following line to see how the embeddings look like\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "# print(input_embeddings)"
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
									
										
										
										
											2023-12-08 06:30:20 -06:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
									
										
										
										
											2024-03-17 09:08:38 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "markdown",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "id": "1fbda581-6f9b-476f-8ea7-d244e6a4eaec",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "- In the initial phase of the input processing workflow, the input text is segmented into separate tokens\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "- Following this segmentation, these tokens are transformed into token IDs based on a predefined vocabulary:"
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
									
										
										
										
											2023-12-08 06:30:20 -06:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "markdown",
							 
						 
					
						
							
								
									
										
										
										
											2024-03-08 07:18:06 -06:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								   "id": "d1bb0f7e-460d-44db-b366-096adcd84fff",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
									
										
										
										
											2024-03-17 09:08:38 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "<img src=\"https://sebastianraschka.com/images/LLMs-from-scratch-images/ch02_compressed/19.webp\" width=\"400px\">"
							 
						 
					
						
							
								
									
										
										
										
											2024-03-08 07:18:06 -06:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "markdown",
							 
						 
					
						
							
								
									
										
										
										
											2023-12-08 06:30:20 -06:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								   "id": "63230f2e-258f-4497-9e2e-8deee4530364",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "# Summary and takeaways"
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "cell_type": "markdown",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "id": "8b3293a6-45a5-47cd-aa00-b23e3ca0a73f",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "metadata": {},
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "source": [
							 
						 
					
						
							
								
									
										
										
										
											2024-06-29 07:33:26 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "See the [./dataloader.ipynb](./dataloader.ipynb) code notebook, which is a concise version of the data loader that we implemented in this chapter and will need for training the GPT model in upcoming chapters.\n",
							 
						 
					
						
							
								
									
										
										
										
											2023-12-08 06:30:20 -06:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "\n",
							 
						 
					
						
							
								
									
										
										
										
											2025-01-17 12:22:00 -06:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    "See [./exercise-solutions.ipynb](./exercise-solutions.ipynb) for the exercise solutions.\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "\n",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "See the [Byte Pair Encoding (BPE) Tokenizer From Scratch](../02_bonus_bytepair-encoder/compare-bpe-tiktoken.ipynb) notebook if you are interested in learning how the GPT-2 tokenizer can be implemented and trained from scratch."
							 
						 
					
						
							
								
									
										
										
										
											2023-12-08 06:30:20 -06:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								   ]
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								  }
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								 ],
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								 "metadata": {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  "kernelspec": {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "display_name": "Python 3 (ipykernel)",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "language": "python",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "name": "python3"
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  },
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								  "language_info": {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "codemirror_mode": {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "name": "ipython",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    "version": 3
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   },
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "file_extension": ".py",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "mimetype": "text/x-python",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "name": "python",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "nbconvert_exporter": "python",
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								   "pygments_lexer": "ipython3",
							 
						 
					
						
							
								
									
										
										
										
											2025-02-28 10:16:21 -06:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								   "version": "3.10.16"
							 
						 
					
						
							
								
									
										
										
										
											2023-09-28 07:08:50 -05:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								  }
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								 },
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								 "nbformat": 4,
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								 "nbformat_minor": 5
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								}