Fix nltk download in preprocessor (#852)

This commit is contained in:
Tu NGUYEN 2021-02-21 10:17:50 +01:00 committed by GitHub
parent e641bff7a6
commit ba91a90dd6
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@@ -44,7 +44,11 @@ class PreProcessor(BasePreProcessor):
to True, the individual split will always have complete sentences &
the number of words will be <= split_length.
"""
nltk.download("punkt")
try:
nltk.data.find('tokenizers/punkt')
except LookupError:
nltk.download('punkt')
self.clean_whitespace = clean_whitespace
self.clean_header_footer = clean_header_footer
self.clean_empty_lines = clean_empty_lines