diff --git a/appendix-A/.DS_Store b/appendix-A/.DS_Store
deleted file mode 100644
index cc77c08..0000000
Binary files a/appendix-A/.DS_Store and /dev/null differ
diff --git a/ch02/01_main-chapter-code/ch02.ipynb b/ch02/01_main-chapter-code/ch02.ipynb
index 5c9cfae..c0cc12b 100644
--- a/ch02/01_main-chapter-code/ch02.ipynb
+++ b/ch02/01_main-chapter-code/ch02.ipynb
@@ -580,9 +580,8 @@
     "preprocessed = re.split(r'([,.?_!\"()\\']|--|\\s)', raw_text)\n",
     "preprocessed = [item.strip() for item in preprocessed if item.strip()]\n",
     "\n",
-    "all_words = sorted(list(set(preprocessed)))\n",
-    "all_tokens = all_words\n",
-    "all_words.extend([\"<|endoftext|>\", \"<|unk|>\"])\n",
+    "all_tokens = sorted(list(set(preprocessed)))\n",
+    "all_tokens.extend([\"<|endoftext|>\", \"<|unk|>\"])\n",
     "\n",
     "vocab = {token:integer for integer,token in enumerate(all_tokens)}"
    ]
   },
@@ -1626,7 +1625,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.11.4"
+   "version": "3.10.12"
   }
  },
 "nbformat": 4,
diff --git a/ch03/01_main-chapter-code/ch03.ipynb b/ch03/01_main-chapter-code/ch03.ipynb
index f1e54be..26ec9ce 100644
--- a/ch03/01_main-chapter-code/ch03.ipynb
+++ b/ch03/01_main-chapter-code/ch03.ipynb
@@ -90,7 +90,7 @@
    "id": "d269e9f1-df11-4644-b575-df338cf46cdf",
    "metadata": {},
    "source": [
-    "- This section explains a very simplified variant of self-attention, which does not contain any trainable weights. This is purely for illustration purposes and NOT the attention mechanism that is used in transformers. The next section, section 3.4, will extend this simple attention mechanism to implement the real self-attention mechanism.\n",
+    "- This section explains a very simplified variant of self-attention, which does not contain any trainable weights. This is purely for illustration purposes and NOT the attention mechanism that is used in transformers. The next section, section 3.3.2, will extend this simple attention mechanism to implement the real self-attention mechanism.\n",
     "- Suppose we are given an input sequence $x^{(1)}$ to $x^{(T)}$.\n",
     "  - The input is a text (for example, a sentence like \"Your journey starts with one step\") that has already been converted into token embeddings as described in chapter 2.\n",
     "  - For instance, $x^{(1)}$ is a d-dimensional vector representing the word \"Your\", and so forth.\n",
@@ -623,7 +623,7 @@
    "metadata": {},
    "source": [
     "- Implementing the self-attention mechanism step by step, we will start by introducing the three training weight matrices $W_q$, $W_k$, and $W_v$.\n",
-    "- These three matrices are used to project the embedded input tokens, $x^{(i)}$ into query, key, and value vectors via matrix multiplication:\n",
+    "- These three matrices are used to project the embedded input tokens, $x^{(i)}$, into query, key, and value vectors via matrix multiplication:\n",
     "\n",
     "  - Query vector: $q^{(i)} = W_q \,x^{(i)}$\n",
     "  - Key vector: $k^{(i)} = W_k \,x^{(i)}$\n",
     "  - Value vector: $v^{(i)} = W_v \,x^{(i)}$\n",
@@ -644,7 +644,7 @@
    "metadata": {},
    "source": [
     "- The embedding dimensions of the input $x$ and the query vector $q$ can be the same or different, depending on the model's design and specific implementation.\n",
-    "- In GPT models, the dimensions are usually the same, but for illustration purposes, to better follow the computation, we choose different input and output dimensions here:"
+    "- In GPT models, the input and output dimensions are usually the same, but for illustration purposes, to better follow the computation, we choose different input and output dimensions here:"
    ]
   },
   {
@@ -654,7 +654,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "x = inputs[1] # second input element\n",
+    "x_2 = inputs[1] # second input element\n",
     "d_in = inputs.shape[1] # the input embedding size, d=3\n",
     "d_out = 2 # the output embedding size, d=2"
    ]
   },
   {
@@ -704,9 +704,9 @@
    }
   ],
   "source": [
-    "query_2 = x @ W_query # _2 because it's with respect to the 2nd input element\n",
-    "key_2 = x @ W_key \n",
-    "value_2 = x @ W_value\n",
+    "query_2 = x_2 @ W_query # _2 because it's with respect to the 2nd input element\n",
+    "key_2 = x_2 @ W_key \n",
+    "value_2 = x_2 @ W_value\n",
     "\n",
     "print(query_2)"
    ]
   },
   {
@@ -998,7 +998,7 @@
    "id": "915cd8a5-a895-42c9-8b8e-06b5ae19ffce",
    "metadata": {},
    "source": [
-    "- Note that `SelfAttention_v1` and `SelfAttention_v2` give different outputs because they use different initial weighs for the weight matrices."
+    "- Note that `SelfAttention_v1` and `SelfAttention_v2` give different outputs because they use different initial weights for the weight matrices."
    ]
   },
   {
@@ -1823,7 +1823,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.6"
+   "version": "3.10.12"
   }
  },
 "nbformat": 4,
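For reference, here is a self-contained sketch of the ch02 vocabulary-building cell as it reads after the hunk above is applied. The short `raw_text` sample is a stand-in for illustration only; the notebook tokenizes a longer text loaded earlier in the chapter:

```python
import re

# Stand-in input text (the notebook uses a longer raw_text loaded from disk).
raw_text = "Your journey starts with one step."

# Split on punctuation, double dashes, and whitespace, then drop empty strings.
preprocessed = re.split(r'([,.?_!"()\']|--|\s)', raw_text)
preprocessed = [item.strip() for item in preprocessed if item.strip()]

# Build the sorted vocabulary once, then append the two special tokens.
all_tokens = sorted(list(set(preprocessed)))
all_tokens.extend(["<|endoftext|>", "<|unk|>"])

# Map each token string to a unique integer ID.
vocab = {token: integer for integer, token in enumerate(all_tokens)}
print(vocab)
```

Note that the original code was not functionally broken: `all_tokens = all_words` aliased the same list, so `all_words.extend(...)` also extended `all_tokens`. The change removes the redundant alias so the intent is clearer.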
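Likewise, a minimal runnable sketch of the renamed `x_2` query/key/value projection from the ch03 hunks. The values of `inputs` and the `torch.rand` weight initialization are assumptions made here so the snippet runs standalone; the notebook defines its own `inputs` tensor and weight matrices:

```python
import torch

torch.manual_seed(123)

# Assumed stand-in for the notebook's `inputs` tensor:
# 6 tokens ("Your journey starts with one step"), each a 3-dimensional embedding.
inputs = torch.rand(6, 3)

x_2 = inputs[1]         # second input element
d_in = inputs.shape[1]  # the input embedding size, d=3
d_out = 2               # the output embedding size, d=2

# Random initialization is an assumption for this sketch; in the notebook these
# are the trainable weight matrices W_q, W_k, and W_v.
W_query = torch.rand(d_in, d_out)
W_key = torch.rand(d_in, d_out)
W_value = torch.rand(d_in, d_out)

# Project the second token into its query, key, and value vectors.
query_2 = x_2 @ W_query  # _2 because it's with respect to the 2nd input element
key_2 = x_2 @ W_key
value_2 = x_2 @ W_value

print(query_2)  # a 2-dimensional query vector
```

The rename from `x` to `x_2` makes the code consistent with the `query_2`/`key_2`/`value_2` naming: everything with a `_2` suffix is computed with respect to the second input element.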