import re

# Lines of this length or shorter are dropped unless they are section headings.
_MIN_LINE_LENGTH = 30

# Two or more consecutive newlines.
_MULTI_NEWLINE_RE = re.compile(r"\n{2,}")

# A heading line ("==Some Title==") immediately followed by the paragraph
# break that precedes the next heading, i.e. a section with no body.
# Anchored to the start of a line so that prose which merely contains and
# ends with "==" is never mistaken for a heading.
_EMPTY_SECTION_RE = re.compile(r"^==.*==\n\n\n", re.MULTILINE)


def clean_wiki_text(text: str) -> str:
    """Clean up plain text extracted from a wiki article.

    The following steps are applied in order:

    1. Runs of consecutive newlines are collapsed into a single newline.
    2. Lines of 30 characters or fewer are removed, except lines that both
       start and end with ``==`` (section headings such as
       ``==Some Title==``).
    3. Two extra newlines are inserted before every heading that is not on
       the first line, separating sections into paragraphs.
    4. Headings that are directly followed by another heading (sections left
       without any body) are removed. A heading at the very end of the text
       is kept.

    Args:
        text: Raw article text using ``\\n`` as the line separator.

    Returns:
        The cleaned text.
    """
    # Get rid of multiple new lines in a single pass.
    text = _MULTI_NEWLINE_RE.sub("\n", text)

    # Remove extremely short lines, but always keep section headings.
    kept_lines = [
        line
        for line in text.split("\n")
        if len(line) > _MIN_LINE_LENGTH
        or (line.startswith("==") and line.endswith("=="))
    ]
    text = "\n".join(kept_lines)

    # Add paragraphs (identified by the wiki section title, which is always
    # in the format "==Some Title==").
    text = text.replace("\n==", "\n\n\n==")

    # Remove empty paragraphs. Matching only at line starts prevents the tail
    # of a content line that happens to end in "==" from being deleted.
    return _EMPTY_SECTION_RE.sub("", text)