Merge pull request #174 from rasbt/update-regex

update regex
This commit is contained in:
Sebastian Raschka 2024-05-22 21:37:50 -04:00 committed by GitHub
commit ec70194d19
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@@ -341,7 +341,7 @@
 }
 ],
 "source": [
-"preprocessed = re.split(r'([,.?_!\"()\\']|--|\\s)', raw_text)\n",
+"preprocessed = re.split(r'([,.:;?_!\"()\\']|--|\\s)', raw_text)\n",
 "preprocessed = [item.strip() for item in preprocessed if item.strip()]\n",
 "print(preprocessed[:30])"
 ]
@@ -364,7 +364,7 @@
 "name": "stdout",
 "output_type": "stream",
 "text": [
-"4649\n"
+"4690\n"
 ]
 }
 ],
@@ -414,7 +414,7 @@
 "name": "stdout",
 "output_type": "stream",
 "text": [
-"1159\n"
+"1130\n"
 ]
 }
 ],
@@ -478,32 +478,32 @@
 "('But', 22)\n",
 "('By', 23)\n",
 "('Carlo', 24)\n",
-"('Carlo;', 25)\n",
-"('Chicago', 26)\n",
-"('Claude', 27)\n",
-"('Come', 28)\n",
-"('Croft', 29)\n",
-"('Destroyed', 30)\n",
-"('Devonshire', 31)\n",
-"('Don', 32)\n",
-"('Dubarry', 33)\n",
-"('Emperors', 34)\n",
-"('Florence', 35)\n",
-"('For', 36)\n",
-"('Gallery', 37)\n",
-"('Gideon', 38)\n",
-"('Gisburn', 39)\n",
-"('Gisburns', 40)\n",
-"('Grafton', 41)\n",
-"('Greek', 42)\n",
-"('Grindle', 43)\n",
-"('Grindle:', 44)\n",
-"('Grindles', 45)\n",
-"('HAD', 46)\n",
-"('Had', 47)\n",
-"('Hang', 48)\n",
-"('Has', 49)\n",
-"('He', 50)\n"
+"('Chicago', 25)\n",
+"('Claude', 26)\n",
+"('Come', 27)\n",
+"('Croft', 28)\n",
+"('Destroyed', 29)\n",
+"('Devonshire', 30)\n",
+"('Don', 31)\n",
+"('Dubarry', 32)\n",
+"('Emperors', 33)\n",
+"('Florence', 34)\n",
+"('For', 35)\n",
+"('Gallery', 36)\n",
+"('Gideon', 37)\n",
+"('Gisburn', 38)\n",
+"('Gisburns', 39)\n",
+"('Grafton', 40)\n",
+"('Greek', 41)\n",
+"('Grindle', 42)\n",
+"('Grindles', 43)\n",
+"('HAD', 44)\n",
+"('Had', 45)\n",
+"('Hang', 46)\n",
+"('Has', 47)\n",
+"('He', 48)\n",
+"('Her', 49)\n",
+"('Hermia', 50)\n"
 ]
 }
 ],
@@ -601,7 +601,7 @@
 "name": "stdout",
 "output_type": "stream",
 "text": [
-"[1, 58, 2, 872, 1013, 615, 541, 763, 5, 1155, 608, 5, 1, 69, 7, 39, 873, 1136, 773, 812, 7]\n"
+"[1, 56, 2, 850, 988, 602, 533, 746, 5, 1126, 596, 5, 1, 67, 7, 38, 851, 1108, 754, 793, 7]\n"
 ]
 }
 ],