Merge pull request #174 from rasbt/update-regex

update regex
This commit is contained in:
Sebastian Raschka 2024-05-22 21:37:50 -04:00 committed by GitHub
commit ec70194d19
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -341,7 +341,7 @@
} }
], ],
"source": [ "source": [
"preprocessed = re.split(r'([,.?_!\"()\\']|--|\\s)', raw_text)\n", "preprocessed = re.split(r'([,.:;?_!\"()\\']|--|\\s)', raw_text)\n",
"preprocessed = [item.strip() for item in preprocessed if item.strip()]\n", "preprocessed = [item.strip() for item in preprocessed if item.strip()]\n",
"print(preprocessed[:30])" "print(preprocessed[:30])"
] ]
@ -364,7 +364,7 @@
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"4649\n" "4690\n"
] ]
} }
], ],
@ -414,7 +414,7 @@
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"1159\n" "1130\n"
] ]
} }
], ],
@ -478,32 +478,32 @@
"('But', 22)\n", "('But', 22)\n",
"('By', 23)\n", "('By', 23)\n",
"('Carlo', 24)\n", "('Carlo', 24)\n",
"('Carlo;', 25)\n", "('Chicago', 25)\n",
"('Chicago', 26)\n", "('Claude', 26)\n",
"('Claude', 27)\n", "('Come', 27)\n",
"('Come', 28)\n", "('Croft', 28)\n",
"('Croft', 29)\n", "('Destroyed', 29)\n",
"('Destroyed', 30)\n", "('Devonshire', 30)\n",
"('Devonshire', 31)\n", "('Don', 31)\n",
"('Don', 32)\n", "('Dubarry', 32)\n",
"('Dubarry', 33)\n", "('Emperors', 33)\n",
"('Emperors', 34)\n", "('Florence', 34)\n",
"('Florence', 35)\n", "('For', 35)\n",
"('For', 36)\n", "('Gallery', 36)\n",
"('Gallery', 37)\n", "('Gideon', 37)\n",
"('Gideon', 38)\n", "('Gisburn', 38)\n",
"('Gisburn', 39)\n", "('Gisburns', 39)\n",
"('Gisburns', 40)\n", "('Grafton', 40)\n",
"('Grafton', 41)\n", "('Greek', 41)\n",
"('Greek', 42)\n", "('Grindle', 42)\n",
"('Grindle', 43)\n", "('Grindles', 43)\n",
"('Grindle:', 44)\n", "('HAD', 44)\n",
"('Grindles', 45)\n", "('Had', 45)\n",
"('HAD', 46)\n", "('Hang', 46)\n",
"('Had', 47)\n", "('Has', 47)\n",
"('Hang', 48)\n", "('He', 48)\n",
"('Has', 49)\n", "('Her', 49)\n",
"('He', 50)\n" "('Hermia', 50)\n"
] ]
} }
], ],
@ -601,7 +601,7 @@
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"[1, 58, 2, 872, 1013, 615, 541, 763, 5, 1155, 608, 5, 1, 69, 7, 39, 873, 1136, 773, 812, 7]\n" "[1, 56, 2, 850, 988, 602, 533, 746, 5, 1126, 596, 5, 1, 67, 7, 38, 851, 1108, 754, 793, 7]\n"
] ]
} }
], ],