mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-07-15 21:10:46 +00:00
26 lines
643 B
Python
26 lines
643 B
Python
import re
|
|
|
|
|
|
def clean_wiki_text(text):
    """Clean raw wiki text by dropping short lines and empty sections.

    The text is processed in three steps:

    1. It is split on the newline character and only lines longer than
       30 characters are kept, plus lines that both start and end with
       ``==`` (wiki section titles such as ``==Some Title==``), whatever
       their length. Blank lines are discarded by this step as well.
    2. Two blank lines are inserted in front of every line that starts
       with ``==`` (except the very first line), so that each section
       becomes its own paragraph.
    3. Any ``==...==`` run that is directly followed by those blank lines
       is removed, i.e. a section title immediately followed by the next
       section title is treated as an empty paragraph.

    Note: a section title on the last line of the text is kept even if it
    has no body, because it is not followed by blank lines.

    Args:
        text (str): Raw wiki text.

    Returns:
        str: The cleaned text.
    """
    # Keep long lines and section titles. Empty lines never pass this
    # filter, so runs of consecutive newlines do not need to be collapsed
    # beforehand.
    kept_lines = [
        line
        for line in text.split("\n")
        if len(line) > 30 or (line.startswith("==") and line.endswith("=="))
    ]
    cleaned = "\n".join(kept_lines)

    # Add paragraphs (identified by the wiki section title, which is always
    # in the format "==Some Title==").
    cleaned = cleaned.replace("\n==", "\n\n\n==")

    # Remove empty paragraphs: after the step above, three newlines can only
    # occur right before a line starting with "==", so a title followed by
    # them has no body of its own.
    return re.sub(r"(==.*==\n\n\n)", "", cleaned)
|