2019-11-14 11:42:51 +01:00

26 lines
643 B
Python

import re
def clean_wiki_text(text):
    """Clean raw wiki text and lay it out as sections.

    The text is split into newline-separated lines and processed as follows:

    1. Blank lines and lines of 30 characters or fewer are dropped, unless
       the line is a section title, i.e. it starts and ends with ``==``
       (wiki format ``==Some Title==``).
    2. Section titles that are left without any body are dropped. A title
       counts as empty when it is directly followed by another line starting
       with ``==`` or when it is the last remaining line.
    3. The remaining lines are joined with single newlines, and two extra
       blank lines are inserted before every line that starts with ``==``
       (except the very first line) to separate the sections visually.

    Args:
        text: Raw wiki text as a single string with newline line endings.

    Returns:
        The cleaned text as a single string; empty if nothing survives.
    """

    def is_title(line):
        return line.startswith("==") and line.endswith("==")

    # Blank lines produced by consecutive newlines have length 0 and are
    # removed by the same length filter, so no separate collapsing pass is
    # needed.
    kept = [
        line
        for line in text.split("\n")
        if len(line) > 30 or is_title(line)
    ]

    # Drop titles without a body. Working on whole lines (rather than with an
    # unanchored regex over the joined text) guarantees that a body line which
    # merely ends in "==" is never truncated, and lets us also catch a title
    # that is the last line of the text.
    with_body = []
    for index, line in enumerate(kept):
        if is_title(line):
            following = kept[index + 1:index + 2]
            if not following or following[0].startswith("=="):
                continue
        with_body.append(line)

    # Separate sections: every line starting with "==" gets two blank lines
    # in front of it.
    return "\n".join(with_body).replace("\n==", "\n\n\n==")