diff --git a/ch02/01_main-chapter-code/ch02.ipynb b/ch02/01_main-chapter-code/ch02.ipynb index bdfcd03..d91527b 100644 --- a/ch02/01_main-chapter-code/ch02.ipynb +++ b/ch02/01_main-chapter-code/ch02.ipynb @@ -341,7 +341,7 @@ } ], "source": [ - "preprocessed = re.split(r'([,.?_!\"()\\']|--|\\s)', raw_text)\n", + "preprocessed = re.split(r'([,.:;?_!\"()\\']|--|\\s)', raw_text)\n", "preprocessed = [item.strip() for item in preprocessed if item.strip()]\n", "print(preprocessed[:30])" ] @@ -364,7 +364,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "4649\n" + "4690\n" ] } ], @@ -414,7 +414,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "1159\n" + "1130\n" ] } ], @@ -478,32 +478,32 @@ "('But', 22)\n", "('By', 23)\n", "('Carlo', 24)\n", - "('Carlo;', 25)\n", - "('Chicago', 26)\n", - "('Claude', 27)\n", - "('Come', 28)\n", - "('Croft', 29)\n", - "('Destroyed', 30)\n", - "('Devonshire', 31)\n", - "('Don', 32)\n", - "('Dubarry', 33)\n", - "('Emperors', 34)\n", - "('Florence', 35)\n", - "('For', 36)\n", - "('Gallery', 37)\n", - "('Gideon', 38)\n", - "('Gisburn', 39)\n", - "('Gisburns', 40)\n", - "('Grafton', 41)\n", - "('Greek', 42)\n", - "('Grindle', 43)\n", - "('Grindle:', 44)\n", - "('Grindles', 45)\n", - "('HAD', 46)\n", - "('Had', 47)\n", - "('Hang', 48)\n", - "('Has', 49)\n", - "('He', 50)\n" + "('Chicago', 25)\n", + "('Claude', 26)\n", + "('Come', 27)\n", + "('Croft', 28)\n", + "('Destroyed', 29)\n", + "('Devonshire', 30)\n", + "('Don', 31)\n", + "('Dubarry', 32)\n", + "('Emperors', 33)\n", + "('Florence', 34)\n", + "('For', 35)\n", + "('Gallery', 36)\n", + "('Gideon', 37)\n", + "('Gisburn', 38)\n", + "('Gisburns', 39)\n", + "('Grafton', 40)\n", + "('Greek', 41)\n", + "('Grindle', 42)\n", + "('Grindles', 43)\n", + "('HAD', 44)\n", + "('Had', 45)\n", + "('Hang', 46)\n", + "('Has', 47)\n", + "('He', 48)\n", + "('Her', 49)\n", + "('Hermia', 50)\n" ] } ], @@ -601,7 +601,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "[1, 58, 2, 872, 1013, 615, 541, 763, 5, 1155, 608, 5, 1, 69, 7, 39, 873, 1136, 773, 812, 7]\n" + "[1, 56, 2, 850, 988, 602, 533, 746, 5, 1126, 596, 5, 1, 67, 7, 38, 851, 1108, 754, 793, 7]\n" ] } ],