From 3a5fc79b38a0bc032c64ee5e752f9c0e95639aba Mon Sep 17 00:00:00 2001
From: rasbt
Date: Mon, 5 Feb 2024 06:51:58 -0600
Subject: [PATCH] add and update readme files

---
 ch02/01_main-chapter-code/README.md  |  2 +-
 ch02/README.md                       |  2 +-
 ch03/01_main-chapter-code/README.md  |  4 ++--
 ch03/README.md                       |  2 +-
 ch04/01_main-chapter-code/README.md  |  6 ++++++
 ch04/01_main-chapter-code/ch04.ipynb | 12 ++++++++----
 ch04/README.md                       |  3 +++
 7 files changed, 22 insertions(+), 9 deletions(-)
 create mode 100644 ch04/01_main-chapter-code/README.md
 create mode 100644 ch04/README.md

diff --git a/ch02/01_main-chapter-code/README.md b/ch02/01_main-chapter-code/README.md
index 646bf68..9286cf8 100644
--- a/ch02/01_main-chapter-code/README.md
+++ b/ch02/01_main-chapter-code/README.md
@@ -1,5 +1,5 @@
 # Chapter 2: Working with Text Data
 
-- [ch02.ipynb](ch02.ipynb) has all the code as it appears in the chapter
+- [ch02.ipynb](ch02.ipynb) contains all the code as it appears in the chapter
 - [dataloader.ipynb](dataloader.ipynb) is a minimal notebook with the main data loading pipeline implemented in this chapter
 
diff --git a/ch02/README.md b/ch02/README.md
index 7c085a9..bd98860 100644
--- a/ch02/README.md
+++ b/ch02/README.md
@@ -1,6 +1,6 @@
 # Chapter 2: Working with Text Data
 
-- [01_main-chapter-code](01_main-chapter-code) contains the main chapter code
+- [01_main-chapter-code](01_main-chapter-code) contains the main chapter code and exercise solutions
 
 - [02_bonus_bytepair-encoder](02_bonus_bytepair-encoder) contains optional code to benchmark different byte pair encoder implementations
 
diff --git a/ch03/01_main-chapter-code/README.md b/ch03/01_main-chapter-code/README.md
index ef8457e..44d8b46 100644
--- a/ch03/01_main-chapter-code/README.md
+++ b/ch03/01_main-chapter-code/README.md
@@ -1,5 +1,5 @@
-# Chapter 3: Understanding Attention Mechanisms
+# Chapter 3: Coding Attention Mechanisms
 
-- [ch03.ipynb](ch03.ipynb) has all the code as it appears in the chapter
+- [ch03.ipynb](ch03.ipynb) contains all the code as it appears in the chapter
 - [multihead-attention.ipynb](multihead-attention.ipynb) is a minimal notebook with the main data loading pipeline implemented in this chapter
 
diff --git a/ch03/README.md b/ch03/README.md
index 9545007..846044b 100644
--- a/ch03/README.md
+++ b/ch03/README.md
@@ -1,3 +1,3 @@
-# Chapter 3: Understanding Attention Mechanisms
+# Chapter 3: Coding Attention Mechanisms
 
 - [01_main-chapter-code](01_main-chapter-code) contains the main chapter code.
\ No newline at end of file
diff --git a/ch04/01_main-chapter-code/README.md b/ch04/01_main-chapter-code/README.md
new file mode 100644
index 0000000..7d22944
--- /dev/null
+++ b/ch04/01_main-chapter-code/README.md
@@ -0,0 +1,6 @@
+# Chapter 4: Implementing a GPT model from Scratch To Generate Text
+
+- [ch04.ipynb](ch04.ipynb) contains all the code as it appears in the chapter
+- [previous_chapters.py](previous_chapters.py) is a Python module that contains the `MultiHeadAttention` module from the previous chapter, which we import in [ch04.ipynb](ch04.ipynb) to create the GPT model
+- [gpt.py](gpt.py) is a standalone Python script file with the code that we implemented thus far, including the GPT model we coded in this chapter
+
diff --git a/ch04/01_main-chapter-code/ch04.ipynb b/ch04/01_main-chapter-code/ch04.ipynb
index 0658728..742c962 100644
--- a/ch04/01_main-chapter-code/ch04.ipynb
+++ b/ch04/01_main-chapter-code/ch04.ipynb
@@ -134,7 +134,9 @@
     "        \n",
     "        # Use a placeholder for LayerNorm\n",
     "        self.final_norm = DummyLayerNorm(cfg[\"emb_dim\"])\n",
-    "        self.out_head = nn.Linear(cfg[\"emb_dim\"], cfg[\"vocab_size\"], bias=False)\n",
+    "        self.out_head = nn.Linear(\n",
+    "            cfg[\"emb_dim\"], cfg[\"vocab_size\"], bias=False\n",
+    "        )\n",
     "\n",
     "    def forward(self, in_idx):\n",
     "        batch_size, seq_len = in_idx.shape\n",
@@ -208,7 +210,7 @@
     "batch.append(torch.tensor(tokenizer.encode(txt1)))\n",
     "batch.append(torch.tensor(tokenizer.encode(txt2)))\n",
     "batch = torch.stack(batch, dim=0)\n",
-    "batch"
+    "print(batch)"
    ]
   },
   {
@@ -772,7 +774,7 @@
     "torch.manual_seed(123)\n",
     "ex_short = ExampleWithShortcut()\n",
     "inputs = torch.tensor([[-1., 1., 2.]])\n",
-    "ex_short(inputs)"
+    "print(ex_short(inputs))"
    ]
   },
   {
@@ -947,7 +949,9 @@
     "        \n",
     "        # Use a placeholder for LayerNorm\n",
     "        self.final_norm = LayerNorm(cfg[\"emb_dim\"])\n",
-    "        self.out_head = nn.Linear(cfg[\"emb_dim\"], cfg[\"vocab_size\"], bias=False)\n",
+    "        self.out_head = nn.Linear(\n",
+    "            cfg[\"emb_dim\"], cfg[\"vocab_size\"], bias=False\n",
+    "        )\n",
     "\n",
     "    def forward(self, in_idx):\n",
     "        batch_size, seq_len = in_idx.shape\n",
diff --git a/ch04/README.md b/ch04/README.md
new file mode 100644
index 0000000..43db748
--- /dev/null
+++ b/ch04/README.md
@@ -0,0 +1,3 @@
+# Chapter 4: Implementing a GPT model from Scratch To Generate Text
+
+- [01_main-chapter-code](01_main-chapter-code) contains the main chapter code.
\ No newline at end of file