diff --git a/ch02/01_main-chapter-code/ch02.ipynb b/ch02/01_main-chapter-code/ch02.ipynb index 4948282..902e7d1 100644 --- a/ch02/01_main-chapter-code/ch02.ipynb +++ b/ch02/01_main-chapter-code/ch02.ipynb @@ -785,9 +785,6 @@ "metadata": {}, "outputs": [], "source": [ - "preprocessed = re.split(r'([,.?_!\"()\\']|--|\\s)', raw_text)\n", - "preprocessed = [item.strip() for item in preprocessed if item.strip()]\n", - "\n", "all_tokens = sorted(list(set(preprocessed)))\n", "all_tokens.extend([\"<|endoftext|>\", \"<|unk|>\"])\n", "\n", @@ -803,7 +800,7 @@ { "data": { "text/plain": [ - "1161" + "1132" ] }, "execution_count": 19, @@ -825,11 +822,11 @@ "name": "stdout", "output_type": "stream", "text": [ - "('younger', 1156)\n", - "('your', 1157)\n", - "('yourself', 1158)\n", - "('<|endoftext|>', 1159)\n", - "('<|unk|>', 1160)\n" + "('younger', 1127)\n", + "('your', 1128)\n", + "('yourself', 1129)\n", + "('<|endoftext|>', 1130)\n", + "('<|unk|>', 1131)\n" ] } ], @@ -918,22 +915,7 @@ { "data": { "text/plain": [ - "[1160,\n", - " 5,\n", - " 362,\n", - " 1155,\n", - " 642,\n", - " 1000,\n", - " 10,\n", - " 1159,\n", - " 57,\n", - " 1013,\n", - " 981,\n", - " 1009,\n", - " 738,\n", - " 1013,\n", - " 1160,\n", - " 7]" + "[1131, 5, 355, 1126, 628, 975, 10, 1130, 55, 988, 956, 984, 722, 988, 1131, 7]" ] }, "execution_count": 23,