diff --git a/ch05/01_main-chapter-code/ch05.ipynb b/ch05/01_main-chapter-code/ch05.ipynb
index ae42271..9532225 100644
--- a/ch05/01_main-chapter-code/ch05.ipynb
+++ b/ch05/01_main-chapter-code/ch05.ipynb
@@ -34,7 +34,7 @@
 "matplotlib version: 3.8.2\n",
 "numpy version: 1.26.0\n",
 "tiktoken version: 0.5.1\n",
- "torch version: 2.2.1\n"
+ "torch version: 2.2.2\n"
 ]
 }
 ],
@@ -1348,9 +1348,8 @@
 "    plt.savefig(\"loss-plot.pdf\")\n",
 "    plt.show()\n",
 "\n",
- "\n",
 "epochs_tensor = torch.linspace(0, num_epochs, len(train_losses))\n",
- "plot_losses(epochs_tensor, tokens_seen, train_losses, val_losses)\n"
+ "plot_losses(epochs_tensor, tokens_seen, train_losses, val_losses)"
 ]
 },
 {
@@ -1413,7 +1412,7 @@
 },
 {
 "cell_type": "code",
- "execution_count": 30,
+ "execution_count": 42,
 "id": "2734cee0-f6f9-42d5-b71c-fa7e0ef28b6d",
 "metadata": {},
 "outputs": [
@@ -1484,7 +1483,7 @@
 },
 {
 "cell_type": "code",
- "execution_count": 31,
+ "execution_count": 43,
 "id": "01a5ce39-3dc8-4c35-96bc-6410a1e42412",
 "metadata": {},
 "outputs": [
@@ -1526,7 +1525,7 @@
 },
 {
 "cell_type": "code",
- "execution_count": 36,
+ "execution_count": 44,
 "id": "6400572f-b3c8-49e2-95bc-433e55c5b3a1",
 "metadata": {},
 "outputs": [
@@ -1546,7 +1545,7 @@
 },
 {
 "cell_type": "code",
- "execution_count": 32,
+ "execution_count": 45,
 "id": "b23b863e-252a-403c-b5b1-62bc0a42319f",
 "metadata": {},
 "outputs": [
@@ -1598,7 +1597,7 @@
 },
 {
 "cell_type": "code",
- "execution_count": 32,
+ "execution_count": 46,
 "id": "0759e4c8-5362-467c-bec6-b0a19d1ba43d",
 "metadata": {},
 "outputs": [],
@@ -1616,7 +1615,7 @@
 },
 {
 "cell_type": "code",
- "execution_count": 33,
+ "execution_count": 47,
 "id": "2e66e613-4aca-4296-a984-ddd0d80c6578",
 "metadata": {},
 "outputs": [
@@ -1660,7 +1659,7 @@
 },
 {
 "cell_type": "code",
- "execution_count": 35,
+ "execution_count": 48,
 "id": "e4600713-c51e-4f53-bf58-040a6eb362b8",
 "metadata": {},
 "outputs": [
@@ -1693,7 +1692,7 @@
 },
 {
 "cell_type": "code",
- "execution_count": 36,
+ "execution_count": 49,
 "id": "9dfb48f0-bc3f-46a5-9844-33b6c9b0f4df",
 "metadata": {},
 "outputs": [
@@ -1759,7 +1758,7 @@
 },
 {
 "cell_type": "code",
- "execution_count": 37,
+ "execution_count": 50,
 "id": "2a7f908a-e9ec-446a-b407-fb6dbf05c806",
 "metadata": {},
 "outputs": [
@@ -1782,7 +1781,7 @@
 },
 {
 "cell_type": "code",
- "execution_count": 38,
+ "execution_count": 51,
 "id": "753865ed-79c5-48b1-b9f2-ccb132ff1d2f",
 "metadata": {},
 "outputs": [
@@ -1790,8 +1789,7 @@
 "name": "stdout",
 "output_type": "stream",
 "text": [
- "tensor([4.5100, -inf, -inf, 6.7500, -inf, -inf, -inf, 6.2800, -inf])\n",
- "torch.Size([9])\n"
+ "tensor([4.5100, -inf, -inf, 6.7500, -inf, -inf, -inf, 6.2800, -inf])\n"
 ]
 }
 ],
@@ -1802,13 +1800,12 @@
 "    other=next_token_logits\n",
 ")\n",
 "\n",
- "print(new_logits)\n",
- "print(new_logits.shape)"
+ "print(new_logits)"
 ]
 },
 {
 "cell_type": "code",
- "execution_count": 39,
+ "execution_count": 52,
 "id": "4844f000-c329-4e7e-aa89-16a2c4ebee43",
 "metadata": {},
 "outputs": [
@@ -1844,7 +1841,7 @@
 },
 {
 "cell_type": "code",
- "execution_count": 40,
+ "execution_count": 68,
 "id": "8e318891-bcc0-4d71-b147-33ce55febfa3",
 "metadata": {},
 "outputs": [],
@@ -1885,18 +1882,10 @@
 "    return idx"
 ]
 },
- {
- "cell_type": "markdown",
- "id": "8817c673-6d27-417c-b2c1-3cff394a340d",
- "metadata": {},
- "source": [
- "- **Exercise:** What are the settings for `generate` to force deterministic behavior?"
- ]
- },
 {
 "cell_type": "code",
- "execution_count": 41,
- "id": "3adb4df3-1150-44e2-93c7-532d205901f9",
+ "execution_count": 69,
+ "id": "aa2a0d7d-0457-42d1-ab9d-bd67683e7ed8",
 "metadata": {},
 "outputs": [
 {
@@ -1904,10 +1893,9 @@
 "output_type": "stream",
 "text": [
 "Output text:\n",
- " Every effort moves you?\"\n",
+ " Every effort moves you know terrace _not brush.\"\n",
 "\n",
- "\"Yes--quite insensible to me, a single one in the house.\"\n",
- "\n"
+ "\"Never a little wild in a and Mrs. G\n"
 ]
 }
 ],
@@ -1919,13 +1907,21 @@
 "    idx=text_to_token_ids(\"Every effort moves you\", tokenizer),\n",
 "    max_new_tokens=20,\n",
 "    context_size=GPT_CONFIG_124M[\"ctx_len\"],\n",
- "    top_k=2,\n",
- "    temperature=1.25\n",
+ "    top_k=10,\n",
+ "    temperature=1.5\n",
 ")\n",
 "\n",
 "print(\"Output text:\\n\", token_ids_to_text(token_ids, tokenizer))"
 ]
 },
+ {
+ "cell_type": "markdown",
+ "id": "8817c673-6d27-417c-b2c1-3cff394a340d",
+ "metadata": {},
+ "source": [
+ "- **Exercise:** What are the settings for `generate` to force deterministic behavior?"
+ ]
+ },
 {
 "cell_type": "markdown",
 "id": "4e2002ca-f4c1-48af-9e0a-88bfc163ba0b",
@@ -1954,7 +1950,7 @@
 },
 {
 "cell_type": "code",
- "execution_count": 42,
+ "execution_count": 70,
 "id": "3d67d869-ac04-4382-bcfb-c96d1ca80d47",
 "metadata": {},
 "outputs": [],
@@ -1972,7 +1968,7 @@
 },
 {
 "cell_type": "code",
- "execution_count": 43,
+ "execution_count": 71,
 "id": "9d57d914-60a3-47f1-b499-5352f4c457cb",
 "metadata": {},
 "outputs": [],
@@ -1993,7 +1989,7 @@
 },
 {
 "cell_type": "code",
- "execution_count": 44,
+ "execution_count": 72,
 "id": "bbd175bb-edf4-450e-a6de-d3e8913c6532",
 "metadata": {},
 "outputs": [],
@@ -2008,13 +2004,17 @@
 },
 {
 "cell_type": "code",
- "execution_count": 45,
+ "execution_count": 73,
 "id": "8a0c7295-c822-43bf-9286-c45abc542868",
 "metadata": {},
 "outputs": [],
 "source": [
 "checkpoint = torch.load(\"model_and_optimizer.pth\")\n",
+ "\n",
+ "model = GPTModel(GPT_CONFIG_124M)\n",
 "model.load_state_dict(checkpoint[\"model_state_dict\"])\n",
+ "\n",
+ "optimizer = torch.optim.AdamW(model.parameters(), lr=5e-4, weight_decay=0.1)\n",
 "optimizer.load_state_dict(checkpoint[\"optimizer_state_dict\"])\n",
 "model.train();"
 ]
@@ -2057,7 +2057,7 @@
 },
 {
 "cell_type": "code",
- "execution_count": 46,
+ "execution_count": 74,
 "id": "fb9fdf02-972a-444e-bf65-8ffcaaf30ce8",
 "metadata": {},
 "outputs": [],
@@ -2067,7 +2067,7 @@
 },
 {
 "cell_type": "code",
- "execution_count": 47,
+ "execution_count": 75,
 "id": "a0747edc-559c-44ef-a93f-079d60227e3f",
 "metadata": {},
 "outputs": [
@@ -2087,105 +2087,12 @@
 },
 {
 "cell_type": "code",
- "execution_count": 48,
+ "execution_count": 84,
 "id": "c5bc89eb-4d39-4287-9b0c-e459ebe7f5ed",
 "metadata": {},
 "outputs": [],
 "source": [
- "import os\n",
- "import requests\n",
- "import json\n",
- "import numpy as np\n",
- "import tensorflow as tf\n",
- "from tqdm import tqdm\n",
- "\n",
- "\n",
- "\n",
- "def download_and_load_gpt2(model_size, models_dir):\n",
- "    # Validate model size\n",
- "    allowed_sizes = (\"124M\", \"355M\", \"774M\", \"1558M\")\n",
- "    if model_size not in allowed_sizes:\n",
- "        raise ValueError(f\"Model size not in {allowed_sizes}\")\n",
- "\n",
- "    # Define paths\n",
- "    model_dir = os.path.join(models_dir, model_size)\n",
- "    base_url = \"https://openaipublic.blob.core.windows.net/gpt-2/models\"\n",
- "    filenames = [\n",
- "        \"checkpoint\", \"encoder.json\", \"hparams.json\",\n",
- "        \"model.ckpt.data-00000-of-00001\", \"model.ckpt.index\",\n",
- "        \"model.ckpt.meta\", \"vocab.bpe\"\n",
- "    ]\n",
- "\n",
- "    # Download files\n",
- "    os.makedirs(model_dir, exist_ok=True)\n",
- "    for filename in filenames:\n",
- "        file_url = os.path.join(base_url, model_size, filename)\n",
- "        file_path = os.path.join(model_dir, filename)\n",
- "        download_file(file_url, file_path)\n",
- "\n",
- "    # Load hparams and params\n",
- "    tf_ckpt_path = tf.train.latest_checkpoint(model_dir)\n",
- "    hparams = json.load(open(os.path.join(model_dir, \"hparams.json\")))\n",
- "    params = load_gpt2_params_from_tf_ckpt(tf_ckpt_path, hparams)\n",
- "\n",
- "    return hparams, params\n",
- "\n",
- "\n",
- "def download_file(url, destination):\n",
- "    # Send a GET request to download the file in streaming mode\n",
- "    response = requests.get(url, stream=True)\n",
- "\n",
- "    # Get the total file size from headers, defaulting to 0 if not present\n",
- "    file_size = int(response.headers.get(\"content-length\", 0))\n",
- "\n",
- "    # Check if file exists and has the same size\n",
- "    if os.path.exists(destination):\n",
- "        file_size_local = os.path.getsize(destination)\n",
- "        if file_size == file_size_local:\n",
- "            print(f\"File already exists and is up-to-date: {destination}\")\n",
- "            return\n",
- "\n",
- "    # Define the block size for reading the file\n",
- "    block_size = 1024 # 1 Kilobyte\n",
- "\n",
- "    # Initialize the progress bar with total file size\n",
- "    progress_bar_description = url.split(\"/\")[-1] # Extract filename from URL\n",
- "    with tqdm(total=file_size, unit=\"iB\", unit_scale=True, desc=progress_bar_description) as progress_bar:\n",
- "        # Open the destination file in binary write mode\n",
- "        with open(destination, \"wb\") as file:\n",
- "            # Iterate over the file data in chunks\n",
- "            for chunk in response.iter_content(block_size):\n",
- "                progress_bar.update(len(chunk)) # Update progress bar\n",
- "                file.write(chunk) # Write the chunk to the file\n",
- "\n",
- "\n",
- "def load_gpt2_params_from_tf_ckpt(ckpt_path, hparams):\n",
- "    # Initialize parameters dictionary with empty blocks for each layer\n",
- "    params = {\"blocks\": [{} for _ in range(hparams[\"n_layer\"])]}\n",
- "\n",
- "    # Iterate over each variable in the checkpoint\n",
- "    for name, _ in tf.train.list_variables(ckpt_path):\n",
- "        # Load the variable and remove singleton dimensions\n",
- "        variable_array = np.squeeze(tf.train.load_variable(ckpt_path, name))\n",
- "\n",
- "        # Process the variable name to extract relevant parts\n",
- "        variable_name_parts = name.split(\"/\")[1:] # Skip the 'model/' prefix\n",
- "\n",
- "        # Identify the target dictionary for the variable\n",
- "        target_dict = params\n",
- "        if variable_name_parts[0].startswith(\"h\"):\n",
- "            layer_number = int(variable_name_parts[0][1:])\n",
- "            target_dict = params[\"blocks\"][layer_number]\n",
- "\n",
- "        # Recursively access or create nested dictionaries\n",
- "        for key in variable_name_parts[1:-1]:\n",
- "            target_dict = target_dict.setdefault(key, {})\n",
- "\n",
- "        # Assign the variable array to the last key\n",
- "        last_key = variable_name_parts[-1]\n",
- "        target_dict[last_key] = variable_array\n",
- "\n",
- "    return params"
+ "from gpt_download import download_and_load_gpt2"
 ]
 },
 {
@@ -2198,21 +2105,21 @@
 },
 {
 "cell_type": "code",
- "execution_count": 49,
+ "execution_count": 85,
 "id": "76271dd7-108d-4f5b-9c01-6ae0aac4b395",
 "metadata": {},
 "outputs": [
 {
- "name": "stdout",
+ "name": "stderr",
 "output_type": "stream",
 "text": [
- "File already exists and is up-to-date: gpt2/124M/checkpoint\n",
- "File already exists and is up-to-date: gpt2/124M/encoder.json\n",
- "File already exists and is up-to-date: gpt2/124M/hparams.json\n",
- "File already exists and is up-to-date: gpt2/124M/model.ckpt.data-00000-of-00001\n",
- "File already exists and is up-to-date: gpt2/124M/model.ckpt.index\n",
- "File already exists and is up-to-date: gpt2/124M/model.ckpt.meta\n",
- "File already exists and is up-to-date: gpt2/124M/vocab.bpe\n"
+ "checkpoint: 100%|████████████████████████████| 77.0/77.0 [00:00<00:00, 132kiB/s]\n",
+ "encoder.json: 100%|███████████████████████| 1.04M/1.04M [00:00<00:00, 3.54MiB/s]\n",
+ "hparams.json: 100%|█████████████████████████| 90.0/90.0 [00:00<00:00, 52.9kiB/s]\n",
+ "model.ckpt.data-00000-of-00001: 100%|███████| 498M/498M [01:02<00:00, 7.93MiB/s]\n",
+ "model.ckpt.index: 100%|███████████████████| 5.21k/5.21k [00:00<00:00, 1.48MiB/s]\n",
+ "model.ckpt.meta: 100%|██████████████████████| 471k/471k [00:00<00:00, 1.93MiB/s]\n",
+ "vocab.bpe: 100%|████████████████████████████| 456k/456k [00:00<00:00, 2.30MiB/s]\n"
 ]
 }
 ],
@@ -2222,7 +2129,7 @@
 },
 {
 "cell_type": "code",
- "execution_count": 50,
+ "execution_count": 86,
 "id": "b1a31951-d971-4a6e-9c43-11ee1168ec6a",
 "metadata": {},
 "outputs": [
@@ -2240,7 +2147,7 @@
 },
 {
 "cell_type": "code",
- "execution_count": 51,
+ "execution_count": 87,
 "id": "857c8331-130e-46ba-921d-fa35d7a73cfe",
 "metadata": {},
 "outputs": [
@@ -2286,7 +2193,7 @@
 },
 {
 "cell_type": "code",
- "execution_count": 52,
+ "execution_count": 88,
 "id": "9fef90dd-0654-4667-844f-08e28339ef7d",
 "metadata": {},
 "outputs": [],
@@ -2319,7 +2226,7 @@
 },
 {
 "cell_type": "code",
- "execution_count": 53,
+ "execution_count": 89,
 "id": "f9a92229-c002-49a6-8cfb-248297ad8296",
 "metadata": {},
 "outputs": [],
@@ -2332,7 +2239,7 @@
 },
 {
 "cell_type": "code",
- "execution_count": 54,
+ "execution_count": 90,
 "id": "f22d5d95-ca5a-425c-a9ec-fc432a12d4e9",
 "metadata": {},
 "outputs": [],
@@ -2385,7 +2292,7 @@
 },
 {
 "cell_type": "code",
- "execution_count": 55,
+ "execution_count": 91,
 "id": "1f690253-f845-4347-b7b6-43fabbd2affa",
 "metadata": {},
 "outputs": [
@@ -2446,10 +2353,18 @@
 "id": "fc7ed189-a633-458c-bf12-4f70b42684b8",
 "metadata": {},
 "source": [
- "- See the [gpt_train.py](train_gpt.py) script containing a self-contained training script\n",
- "- The [gpt_generate.py](generate_gpt.py) script loads pretrained weights from OpenAI and generates text based on a prompt\n",
+ "- See the [gpt_train.py](gpt_train.py) script containing a self-contained training script\n",
+ "- The [gpt_generate.py](gpt_generate.py) script loads pretrained weights from OpenAI and generates text based on a prompt\n",
 "- You can find the exercise solutions in [exercise-solutions.ipynb](exercise-solutions.ipynb)"
 ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "2ef9585d-ea2e-4c04-9dd5-71d003a9dd07",
+ "metadata": {},
+ "outputs": [],
+ "source": []
 }
 ],
 "metadata": {
diff --git a/ch05/01_main-chapter-code/gpt_download.py b/ch05/01_main-chapter-code/gpt_download.py
new file mode 100644
index 0000000..89f2bc7
--- /dev/null
+++ b/ch05/01_main-chapter-code/gpt_download.py
@@ -0,0 +1,93 @@
+import os
+import requests
+import json
+import numpy as np
+import tensorflow as tf
+from tqdm import tqdm
+
+
+def download_and_load_gpt2(model_size, models_dir):
+    # Validate model size
+    allowed_sizes = ("124M", "355M", "774M", "1558M")
+    if model_size not in allowed_sizes:
+        raise ValueError(f"Model size not in {allowed_sizes}")
+
+    # Define paths
+    model_dir = os.path.join(models_dir, model_size)
+    base_url = "https://openaipublic.blob.core.windows.net/gpt-2/models"
+    filenames = [
+        "checkpoint", "encoder.json", "hparams.json",
+        "model.ckpt.data-00000-of-00001", "model.ckpt.index",
+        "model.ckpt.meta", "vocab.bpe"
+    ]
+
+    # Download files
+    os.makedirs(model_dir, exist_ok=True)
+    for filename in filenames:
+        file_url = os.path.join(base_url, model_size, filename)
+        file_path = os.path.join(model_dir, filename)
+        download_file(file_url, file_path)
+
+    # Load hparams and params
+    tf_ckpt_path = tf.train.latest_checkpoint(model_dir)
+    hparams = json.load(open(os.path.join(model_dir, "hparams.json")))
+    params = load_gpt2_params_from_tf_ckpt(tf_ckpt_path, hparams)
+
+    return hparams, params
+
+
+def download_file(url, destination):
+    # Send a GET request to download the file in streaming mode
+    response = requests.get(url, stream=True)
+
+    # Get the total file size from headers, defaulting to 0 if not present
+    file_size = int(response.headers.get("content-length", 0))
+
+    # Check if file exists and has the same size
+    if os.path.exists(destination):
+        file_size_local = os.path.getsize(destination)
+        if file_size == file_size_local:
+            print(f"File already exists and is up-to-date: {destination}")
+            return
+
+    # Define the block size for reading the file
+    block_size = 1024 # 1 Kilobyte
+
+    # Initialize the progress bar with total file size
+    progress_bar_description = url.split("/")[-1] # Extract filename from URL
+    with tqdm(total=file_size, unit="iB", unit_scale=True, desc=progress_bar_description) as progress_bar:
+        # Open the destination file in binary write mode
+        with open(destination, "wb") as file:
+            # Iterate over the file data in chunks
+            for chunk in response.iter_content(block_size):
+                progress_bar.update(len(chunk)) # Update progress bar
+                file.write(chunk) # Write the chunk to the file
+
+
+def load_gpt2_params_from_tf_ckpt(ckpt_path, hparams):
+    # Initialize parameters dictionary with empty blocks for each layer
+    params = {"blocks": [{} for _ in range(hparams["n_layer"])]}
+
+    # Iterate over each variable in the checkpoint
+    for name, _ in tf.train.list_variables(ckpt_path):
+        # Load the variable and remove singleton dimensions
+        variable_array = np.squeeze(tf.train.load_variable(ckpt_path, name))
+
+        # Process the variable name to extract relevant parts
+        variable_name_parts = name.split("/")[1:] # Skip the 'model/' prefix
+
+        # Identify the target dictionary for the variable
+        target_dict = params
+        if variable_name_parts[0].startswith("h"):
+            layer_number = int(variable_name_parts[0][1:])
+            target_dict = params["blocks"][layer_number]
+
+        # Recursively access or create nested dictionaries
+        for key in variable_name_parts[1:-1]:
+            target_dict = target_dict.setdefault(key, {})
+
+        # Assign the variable array to the last key
+        last_key = variable_name_parts[-1]
+        target_dict[last_key] = variable_array
+
+    return params
diff --git a/ch05/01_main-chapter-code/generate.py b/ch05/01_main-chapter-code/gpt_generate.py
similarity index 100%
rename from ch05/01_main-chapter-code/generate.py
rename to ch05/01_main-chapter-code/gpt_generate.py
diff --git a/ch05/01_main-chapter-code/train.py b/ch05/01_main-chapter-code/gpt_train.py
similarity index 100%
rename from ch05/01_main-chapter-code/train.py
rename to ch05/01_main-chapter-code/gpt_train.py