diff --git a/ch02/05_bpe-from-scratch/bpe-from-scratch.ipynb b/ch02/05_bpe-from-scratch/bpe-from-scratch.ipynb
index 18d3d91..9853ac4 100644
--- a/ch02/05_bpe-from-scratch/bpe-from-scratch.ipynb
+++ b/ch02/05_bpe-from-scratch/bpe-from-scratch.ipynb
@@ -861,7 +861,7 @@
    "metadata": {},
    "source": [
     "- Next, let's initialize and train the BPE tokenizer with a vocabulary size of 1,000\n",
-    "- Note that the vocabulary size is already 255 by default due to the byte values discussed earlier, so we are only \"learning\" 745 vocabulary entries \n",
+    "- Note that the vocabulary size is already 256 by default due to the byte values discussed earlier, so we are only \"learning\" 744 vocabulary entries (if we consider the `<|endoftext|>` special token and the `Ġ` whitespace token; so, that's 742 to be precise)\n",
     "- For comparison, the GPT-2 vocabulary is 50,257 tokens, the GPT-4 vocabulary is 100,256 tokens (`cl100k_base` in tiktoken), and GPT-4o uses 199,997 tokens (`o200k_base` in tiktoken); they have all much bigger training sets compared to our simple example text above"
    ]
   },
@@ -908,7 +908,7 @@
    "id": "36c9da0f-8a18-41cd-91ea-9ccc2bb5febb",
    "metadata": {},
    "source": [
-    "- This vocabulary is created by merging 742 times (~ `1000 - len(range(0, 256))`)"
+    "- This vocabulary is created by merging 742 times (`= 1000 - len(range(0, 256)) - len(special_tokens) - \"Ġ\" = 1000 - 256 - 1 - 1 = 742`)"
    ]
   },
   {
@@ -975,12 +975,12 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "[424, 256, 654, 531, 302, 311, 256, 296, 97, 465, 121, 595, 841, 116, 287, 466, 256, 326, 972, 46, 256, 60, 124, 271, 683, 102, 116, 461, 116, 124, 62]\n"
+      "[424, 256, 654, 531, 302, 311, 256, 296, 97, 465, 121, 595, 841, 116, 287, 466, 256, 326, 972, 46, 60, 124, 271, 683, 102, 116, 461, 116, 124, 62]\n"
      ]
     }
    ],
    "source": [
-    "input_text = \"Jack embraced beauty through art and life. <|endoftext|> \"\n",
+    "input_text = \"Jack embraced beauty through art and life.<|endoftext|> \"\n",
     "token_ids = tokenizer.encode(input_text)\n",
     "print(token_ids)"
    ]
   },
@@ -1000,7 +1000,7 @@
     }
    ],
    "source": [
-    "input_text = \"Jack embraced beauty through art and life. <|endoftext|> \"\n",
+    "input_text = \"Jack embraced beauty through art and life.<|endoftext|> \"\n",
     "token_ids = tokenizer.encode(input_text, allowed_special={\"<|endoftext|>\"})\n",
     "print(token_ids)"
    ]
   },
@@ -1015,7 +1015,7 @@
     "name": "stdout",
     "output_type": "stream",
     "text": [
-     "Number of characters: 57\n",
+     "Number of characters: 56\n",
      "Number of token IDs: 21\n"
     ]
    },
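
The corrected numbers in this diff can be sanity-checked independently. Below is a minimal sketch in plain Python (not part of the notebook or the diff; variable names are illustrative only) that reproduces the merge-count arithmetic from the updated markdown cells and the character count from the updated output cell, under the assumption that the base vocabulary holds the 256 byte values plus the `<|endoftext|>` special token plus the `Ġ` whitespace marker before any merges are learned.

```python
# Sketch: verify the merge-count arithmetic from the corrected markdown cells.
# Assumption: base vocabulary = 256 byte tokens + 1 special token + 1 "Ġ" marker.
target_vocab_size = 1000
byte_tokens = len(range(0, 256))    # 256 single-byte tokens
special_tokens = ["<|endoftext|>"]  # 1 special token
whitespace_marker = 1               # the "Ġ" whitespace token

num_merges = target_vocab_size - byte_tokens - len(special_tokens) - whitespace_marker
print(num_merges)  # 742, matching "merging 742 times" in the corrected cell

# Sketch: verify the character count after dropping the space before <|endoftext|>.
input_text = "Jack embraced beauty through art and life.<|endoftext|> "
print(len(input_text))  # 56, matching the corrected "Number of characters" output
```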